From 1bdc63851d2329f3fa1f07027adc862d89f24df9 Mon Sep 17 00:00:00 2001 From: aplio Date: Thu, 30 Jan 2025 21:51:11 +0900 Subject: [PATCH 1/2] feature. add feat to modify metadata via dataset api --- .../service_api/dataset/document.py | 98 +++++++++ api/services/dataset_service.py | 15 ++ .../knowledge_entities/knowledge_entities.py | 6 + .../datasets/template/template.en.mdx | 199 +++++++++++++++++ .../datasets/template/template.zh.mdx | 204 ++++++++++++++++++ 5 files changed, 522 insertions(+) diff --git a/api/controllers/service_api/dataset/document.py b/api/controllers/service_api/dataset/document.py index 2e148dd84c05da..b4c3a4c6075781 100644 --- a/api/controllers/service_api/dataset/document.py +++ b/api/controllers/service_api/dataset/document.py @@ -18,6 +18,7 @@ from controllers.service_api.dataset.error import ( ArchivedDocumentImmutableError, DocumentIndexingError, + InvalidMetadataError, ) from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check from core.errors.error import ProviderTokenNotInitError @@ -50,6 +51,9 @@ def post(self, tenant_id, dataset_id): "indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json" ) parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json") + parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json") + parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json") + args = parser.parse_args() dataset_id = str(dataset_id) tenant_id = str(tenant_id) @@ -61,6 +65,28 @@ def post(self, tenant_id, dataset_id): if not dataset.indexing_technique and not args["indexing_technique"]: raise ValueError("indexing_technique is required.") + # Validate metadata if provided + if args.get("doc_type") or args.get("doc_metadata"): + if not args.get("doc_type") or not args.get("doc_metadata"): + raise InvalidMetadataError("Both doc_type and doc_metadata 
must be provided when adding metadata") + + if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: + raise InvalidMetadataError( + "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) + ) + + if not isinstance(args["doc_metadata"], dict): + raise InvalidMetadataError("doc_metadata must be a dictionary") + + # Validate metadata schema based on doc_type + if args["doc_type"] != "others": + metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] + for key, value in args["doc_metadata"].items(): + if key in metadata_schema and not isinstance(value, metadata_schema[key]): + raise InvalidMetadataError(f"Invalid type for metadata field {key}") + # set to MetaDataConfig + args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} + text = args.get("text") name = args.get("name") if text is None or name is None: @@ -107,6 +133,8 @@ def post(self, tenant_id, dataset_id, document_id): "doc_language", type=str, default="English", required=False, nullable=False, location="json" ) parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json") + parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json") + parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json") args = parser.parse_args() dataset_id = str(dataset_id) tenant_id = str(tenant_id) @@ -115,6 +143,29 @@ def post(self, tenant_id, dataset_id, document_id): if not dataset: raise ValueError("Dataset is not exist.") + # Validate metadata if provided + if args.get("doc_type") or args.get("doc_metadata"): + if not args.get("doc_type") or not args.get("doc_metadata"): + raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata") + + if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: + raise InvalidMetadataError( + "Invalid doc_type. 
Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) + ) + + if not isinstance(args["doc_metadata"], dict): + raise InvalidMetadataError("doc_metadata must be a dictionary") + + # Validate metadata schema based on doc_type + if args["doc_type"] != "others": + metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] + for key, value in args["doc_metadata"].items(): + if key in metadata_schema and not isinstance(value, metadata_schema[key]): + raise InvalidMetadataError(f"Invalid type for metadata field {key}") + + # set to MetaDataConfig + args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} + if args["text"]: text = args.get("text") name = args.get("name") @@ -161,6 +212,30 @@ def post(self, tenant_id, dataset_id): args["doc_form"] = "text_model" if "doc_language" not in args: args["doc_language"] = "English" + + # Validate metadata if provided + if args.get("doc_type") or args.get("doc_metadata"): + if not args.get("doc_type") or not args.get("doc_metadata"): + raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata") + + if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: + raise InvalidMetadataError( + "Invalid doc_type. 
Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) + ) + + if not isinstance(args["doc_metadata"], dict): + raise InvalidMetadataError("doc_metadata must be a dictionary") + + # Validate metadata schema based on doc_type + if args["doc_type"] != "others": + metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] + for key, value in args["doc_metadata"].items(): + if key in metadata_schema and not isinstance(value, metadata_schema[key]): + raise InvalidMetadataError(f"Invalid type for metadata field {key}") + + # set to MetaDataConfig + args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} + # get dataset info dataset_id = str(dataset_id) tenant_id = str(tenant_id) @@ -228,6 +303,29 @@ def post(self, tenant_id, dataset_id, document_id): if "doc_language" not in args: args["doc_language"] = "English" + # Validate metadata if provided + if args.get("doc_type") or args.get("doc_metadata"): + if not args.get("doc_type") or not args.get("doc_metadata"): + raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata") + + if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: + raise InvalidMetadataError( + "Invalid doc_type. 
Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) + ) + + if not isinstance(args["doc_metadata"], dict): + raise InvalidMetadataError("doc_metadata must be a dictionary") + + # Validate metadata schema based on doc_type + if args["doc_type"] != "others": + metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] + for key, value in args["doc_metadata"].items(): + if key in metadata_schema and not isinstance(value, metadata_schema[key]): + raise InvalidMetadataError(f"Invalid type for metadata field {key}") + + # set to MetaDataConfig + args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} + # get dataset info dataset_id = str(dataset_id) tenant_id = str(tenant_id) diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index c4059337367c1a..38025b5213aaa3 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -42,6 +42,7 @@ from services.entities.knowledge_entities.knowledge_entities import ( ChildChunkUpdateArgs, KnowledgeConfig, + MetaDataConfig, RerankingModel, RetrievalModel, SegmentUpdateArgs, @@ -894,6 +895,9 @@ def save_document_with_dataset_id( document.data_source_info = json.dumps(data_source_info) document.batch = batch document.indexing_status = "waiting" + if knowledge_config.metadata: + document.doc_type = knowledge_config.metadata.doc_type + document.doc_metadata = knowledge_config.metadata.doc_metadata db.session.add(document) documents.append(document) duplicate_document_ids.append(document.id) @@ -910,6 +914,7 @@ def save_document_with_dataset_id( account, file_name, batch, + knowledge_config.metadata, ) db.session.add(document) db.session.flush() @@ -965,6 +970,7 @@ def save_document_with_dataset_id( account, page.page_name, batch, + knowledge_config.metadata, ) db.session.add(document) db.session.flush() @@ -1005,6 +1011,7 @@ def save_document_with_dataset_id( account, document_name, batch, + 
knowledge_config.metadata, ) db.session.add(document) db.session.flush() @@ -1042,6 +1049,7 @@ def build_document( account: Account, name: str, batch: str, + metadata: Optional[MetaDataConfig] = None, ): document = Document( tenant_id=dataset.tenant_id, @@ -1057,6 +1065,9 @@ def build_document( doc_form=document_form, doc_language=document_language, ) + if metadata is not None: + document.doc_metadata = metadata.doc_metadata + document.doc_type = metadata.doc_type return document @staticmethod @@ -1169,6 +1180,10 @@ def update_document_with_dataset_id( # update document name if document_data.name: document.name = document_data.name + # update doc_type and doc_metadata if provided + if document_data.metadata is not None: + document.doc_metadata = document_data.metadata.doc_metadata + document.doc_type = document_data.metadata.doc_type # update document to be waiting document.indexing_status = "waiting" document.completed_at = None diff --git a/api/services/entities/knowledge_entities/knowledge_entities.py b/api/services/entities/knowledge_entities/knowledge_entities.py index 8d6a246b6428d0..f14c5b513a8687 100644 --- a/api/services/entities/knowledge_entities/knowledge_entities.py +++ b/api/services/entities/knowledge_entities/knowledge_entities.py @@ -93,6 +93,11 @@ class RetrievalModel(BaseModel): score_threshold: Optional[float] = None +class MetaDataConfig(BaseModel): + doc_type: str + doc_metadata: dict + + class KnowledgeConfig(BaseModel): original_document_id: Optional[str] = None duplicate: bool = True @@ -105,6 +110,7 @@ class KnowledgeConfig(BaseModel): embedding_model: Optional[str] = None embedding_model_provider: Optional[str] = None name: Optional[str] = None + metadata: Optional[MetaDataConfig] = None class SegmentUpdateArgs(BaseModel): diff --git a/web/app/(commonLayout)/datasets/template/template.en.mdx b/web/app/(commonLayout)/datasets/template/template.en.mdx index 3fa22a1620ed9e..ac57e3aef2233b 100644 --- 
a/web/app/(commonLayout)/datasets/template/template.en.mdx +++ b/web/app/(commonLayout)/datasets/template/template.en.mdx @@ -47,6 +47,44 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi Document content + + Type of document (optional): + - book Book + - web_page Web page + - paper Academic paper/article + - social_media_post Social media post + - wikipedia_entry Wikipedia entry + - personal_document Personal document + - business_document Business document + - im_chat_log Chat log + - synced_from_notion Notion document + - synced_from_github GitHub document + - others Other document types + + + Document metadata (required if doc_type is provided). Fields vary by doc_type: + For book: + - title Book title + - language Book language + - author Book author + - publisher Publisher name + - publication_date Publication date + - isbn ISBN number + - category Book category + + For web_page: + - title Page title + - url Page URL + - language Page language + - publish_date Publish date + - author/publisher Author or publisher + - topic/keywords Topic or keywords + - description Page description + + Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type. 
+ + For doc_type "others", any valid JSON object is accepted + Index mode - high_quality High quality: embedding using embedding model, built as vector database index @@ -195,6 +233,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi - hierarchical_model Parent-child mode - qa_model Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions + - doc_type Type of document (optional) + - book Book + Document records a book or publication + - web_page Web page + Document records web page content + - paper Academic paper/article + Document records academic paper or research article + - social_media_post Social media post + Content from social media posts + - wikipedia_entry Wikipedia entry + Content from Wikipedia entries + - personal_document Personal document + Documents related to personal content + - business_document Business document + Documents related to business content + - im_chat_log Chat log + Records of instant messaging chats + - synced_from_notion Notion document + Documents synchronized from Notion + - synced_from_github GitHub document + Documents synchronized from GitHub + - others Other document types + Other document types not listed above + + - doc_metadata Document metadata (required if doc_type is provided) + Fields vary by doc_type: + + For book: + - title Book title + Title of the book + - language Book language + Language of the book + - author Book author + Author of the book + - publisher Publisher name + Name of the publishing house + - publication_date Publication date + Date when the book was published + - isbn ISBN number + International Standard Book Number + - category Book category + Category or genre of the book + + For web_page: + - title Page title + Title of the web page + - url Page URL + URL address of the web page + - language Page language + Language of the web page + - publish_date Publish date + Date when the web page was published + - author/publisher Author or 
publisher + Author or publisher of the web page + - topic/keywords Topic or keywords + Topics or keywords of the web page + - description Page description + Description of the web page content + + Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type. + For doc_type "others", any valid JSON object is accepted + - doc_language In Q&A mode, specify the language of the document, for example: English, Chinese - process_rule Processing rules @@ -307,6 +407,44 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi Knowledge description (optional) + + Type of document (optional): + - book Book + - web_page Web page + - paper Academic paper/article + - social_media_post Social media post + - wikipedia_entry Wikipedia entry + - personal_document Personal document + - business_document Business document + - im_chat_log Chat log + - synced_from_notion Notion document + - synced_from_github GitHub document + - others Other document types + + + Document metadata (required if doc_type is provided). Fields vary by doc_type: + For book: + - title Book title + - language Book language + - author Book author + - publisher Publisher name + - publication_date Publication date + - isbn ISBN number + - category Book category + + For web_page: + - title Page title + - url Page URL + - language Page language + - publish_date Publish date + - author/publisher Author or publisher + - topic/keywords Topic or keywords + - description Page description + + Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type. 
+ + For doc_type "others", any valid JSON object is accepted + Index technique (optional) - high_quality High quality @@ -624,6 +762,67 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi - separator Segmentation identifier. Currently, only one delimiter is allowed. The default is *** - max_tokens The maximum length (tokens) must be validated to be shorter than the length of the parent chunk - chunk_overlap Define the overlap between adjacent chunks (optional) + - doc_type Type of document (optional) + - book Book + Document records a book or publication + - web_page Web page + Document records web page content + - paper Academic paper/article + Document records academic paper or research article + - social_media_post Social media post + Content from social media posts + - wikipedia_entry Wikipedia entry + Content from Wikipedia entries + - personal_document Personal document + Documents related to personal content + - business_document Business document + Documents related to business content + - im_chat_log Chat log + Records of instant messaging chats + - synced_from_notion Notion document + Documents synchronized from Notion + - synced_from_github GitHub document + Documents synchronized from GitHub + - others Other document types + Other document types not listed above + + - doc_metadata Document metadata (required if doc_type is provided) + Fields vary by doc_type: + + For book: + - title Book title + Title of the book + - language Book language + Language of the book + - author Book author + Author of the book + - publisher Publisher name + Name of the publishing house + - publication_date Publication date + Date when the book was published + - isbn ISBN number + International Standard Book Number + - category Book category + Category or genre of the book + + For web_page: + - title Page title + Title of the web page + - url Page URL + URL address of the web page + - language Page language + Language of the web page + - 
publish_date Publish date + Date when the web page was published + - author/publisher Author or publisher + Author or publisher of the web page + - topic/keywords Topic or keywords + Topics or keywords of the web page + - description Page description + Description of the web page content + + Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type. + For doc_type "others", any valid JSON object is accepted diff --git a/web/app/(commonLayout)/datasets/template/template.zh.mdx b/web/app/(commonLayout)/datasets/template/template.zh.mdx index 334591743f931b..7bd617b55bc731 100644 --- a/web/app/(commonLayout)/datasets/template/template.zh.mdx +++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx @@ -47,6 +47,46 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi 文档内容 + + 文档类型(选填) + - book 图书 Book + - web_page 网页 Web page + - paper 学术论文/文章 Academic paper/article + - social_media_post 社交媒体帖子 Social media post + - wikipedia_entry 维基百科条目 Wikipedia entry + - personal_document 个人文档 Personal document + - business_document 商业文档 Business document + - im_chat_log 即时通讯记录 Chat log + - synced_from_notion Notion同步文档 Notion document + - synced_from_github GitHub同步文档 GitHub document + - others 其他文档类型 Other document types + + + + 文档元数据(如提供文档类型则必填)。字段因文档类型而异: + + 针对图书 For book: + - title 书名 Book title + - language 图书语言 Book language + - author 作者 Book author + - publisher 出版社 Publisher name + - publication_date 出版日期 Publication date + - isbn ISBN号码 ISBN number + - category 图书分类 Book category + + 针对网页 For web_page: + - title 页面标题 Page title + - url 页面网址 Page URL + - language 页面语言 Page language + - publish_date 发布日期 Publish date + - author/publisher 作者/发布者 Author or publisher + - topic/keywords 主题/关键词 Topic or keywords + - description 页面描述 Page description + + 请查看 api/services/dataset_service.py 了解各文档类型所需字段的详细信息。 + + 
针对"其他"类型文档,接受任何有效的JSON对象 + 索引方式 - high_quality 高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引 @@ -194,6 +234,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi - text_model text 文档直接 embedding,经济模式默认为该模式 - hierarchical_model parent-child 模式 - qa_model Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding + - doc_type 文档类型(选填)Type of document (optional) + - book 图书 + 文档记录一本书籍或出版物 + - web_page 网页 + 网页内容的文档记录 + - paper 学术论文/文章 + 学术论文或研究文章的记录 + - social_media_post 社交媒体帖子 + 社交媒体上的帖子内容 + - wikipedia_entry 维基百科条目 + 维基百科的词条内容 + - personal_document 个人文档 + 个人相关的文档记录 + - business_document 商业文档 + 商业相关的文档记录 + - im_chat_log 即时通讯记录 + 即时通讯的聊天记录 + - synced_from_notion Notion同步文档 + 从Notion同步的文档内容 + - synced_from_github GitHub同步文档 + 从GitHub同步的文档内容 + - others 其他文档类型 + 其他未列出的文档类型 + + - doc_metadata 文档元数据(如提供文档类型则必填 + 字段因文档类型而异 + + 针对图书类型 For book: + - title 书名 + 书籍的标题 + - language 图书语言 + 书籍的语言 + - author 作者 + 书籍的作者 + - publisher 出版社 + 出版社的名称 + - publication_date 出版日期 + 书籍的出版日期 + - isbn ISBN号码 + 书籍的ISBN编号 + - category 图书分类 + 书籍的分类类别 + + 针对网页类型 For web_page: + - title 页面标题 + 网页的标题 + - url 页面网址 + 网页的URL地址 + - language 页面语言 + 网页的语言 + - publish_date 发布日期 + 网页的发布日期 + - author/publisher 作者/发布者 + 网页的作者或发布者 + - topic/keywords 主题/关键词 + 网页的主题或关键词 + - description 页面描述 + 网页的描述信息 + + 请查看 api/services/dataset_service.py 了解各文档类型所需字段的详细信息。 + + 针对"其他"类型文档,接受任何有效的JSON对象 - doc_language 在 Q&A 模式下,指定文档的语言,例如:EnglishChinese @@ -504,6 +606,46 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi 文档内容(选填) + + 文档类型(选填) + - book 图书 Book + - web_page 网页 Web page + - paper 学术论文/文章 Academic paper/article + - social_media_post 社交媒体帖子 Social media post + - wikipedia_entry 维基百科条目 Wikipedia entry + - personal_document 个人文档 Personal document + - business_document 商业文档 Business document + - im_chat_log 即时通讯记录 Chat log + - synced_from_notion Notion同步文档 Notion document + - synced_from_github GitHub同步文档 GitHub document + - others 其他文档类型 Other document types + + + + 
文档元数据(如提供文档类型则必填)。字段因文档类型而异: + + 针对图书 For book: + - title 书名 Book title + - language 图书语言 Book language + - author 作者 Book author + - publisher 出版社 Publisher name + - publication_date 出版日期 Publication date + - isbn ISBN号码 ISBN number + - category 图书分类 Book category + + 针对网页 For web_page: + - title 页面标题 Page title + - url 页面网址 Page URL + - language 页面语言 Page language + - publish_date 发布日期 Publish date + - author/publisher 作者/发布者 Author or publisher + - topic/keywords 主题/关键词 Topic or keywords + - description 页面描述 Page description + + 请查看 api/services/dataset_service.py 了解各文档类型所需字段的详细信息。 + + 针对"其他"类型文档,接受任何有效的JSON对象 + 处理规则(选填) - mode (string) 清洗、分段模式 ,automatic 自动 / custom 自定义 @@ -624,6 +766,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi - separator 分段标识符,目前仅允许设置一个分隔符。默认为 *** - max_tokens 最大长度 (token) 需要校验小于父级的长度 - chunk_overlap 分段重叠指的是在对数据进行分段时,段与段之间存在一定的重叠部分(选填) + - doc_type 文档类型(选填)Type of document (optional) + - book 图书 + 文档记录一本书籍或出版物 + - web_page 网页 + 网页内容的文档记录 + - paper 学术论文/文章 + 学术论文或研究文章的记录 + - social_media_post 社交媒体帖子 + 社交媒体上的帖子内容 + - wikipedia_entry 维基百科条目 + 维基百科的词条内容 + - personal_document 个人文档 + 个人相关的文档记录 + - business_document 商业文档 + 商业相关的文档记录 + - im_chat_log 即时通讯记录 + 即时通讯的聊天记录 + - synced_from_notion Notion同步文档 + 从Notion同步的文档内容 + - synced_from_github GitHub同步文档 + 从GitHub同步的文档内容 + - others 其他文档类型 + 其他未列出的文档类型 + + - doc_metadata 文档元数据(如提供文档类型则必填 + 字段因文档类型而异 + + 针对图书类型 For book: + - title 书名 + 书籍的标题 + - language 图书语言 + 书籍的语言 + - author 作者 + 书籍的作者 + - publisher 出版社 + 出版社的名称 + - publication_date 出版日期 + 书籍的出版日期 + - isbn ISBN号码 + 书籍的ISBN编号 + - category 图书分类 + 书籍的分类类别 + + 针对网页类型 For web_page: + - title 页面标题 + 网页的标题 + - url 页面网址 + 网页的URL地址 + - language 页面语言 + 网页的语言 + - publish_date 发布日期 + 网页的发布日期 + - author/publisher 作者/发布者 + 网页的作者或发布者 + - topic/keywords 主题/关键词 + 网页的主题或关键词 + - description 页面描述 + 网页的描述信息 + + 请查看 api/services/dataset_service.py 了解各文档类型所需字段的详细信息。 + + 针对"其他"类型文档,接受任何有效的JSON对象 From 
ef39389a480e93ffeb7ddb2783bed683e931a0f7 Mon Sep 17 00:00:00 2001 From: aplio Date: Thu, 30 Jan 2025 22:15:12 +0900 Subject: [PATCH 2/2] fix. add link to zh too --- web/app/(commonLayout)/datasets/template/template.zh.mdx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/web/app/(commonLayout)/datasets/template/template.zh.mdx b/web/app/(commonLayout)/datasets/template/template.zh.mdx index 7bd617b55bc731..0e5857c4464d8a 100644 --- a/web/app/(commonLayout)/datasets/template/template.zh.mdx +++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx @@ -83,7 +83,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi - topic/keywords 主题/关键词 Topic or keywords - description 页面描述 Page description - 请查看 api/services/dataset_service.py 了解各文档类型所需字段的详细信息。 + 请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。 针对"其他"类型文档,接受任何有效的JSON对象 @@ -293,7 +293,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi - description 页面描述 网页的描述信息 - 请查看 api/services/dataset_service.py 了解各文档类型所需字段的详细信息。 + 请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。 针对"其他"类型文档,接受任何有效的JSON对象 @@ -642,7 +642,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi - topic/keywords 主题/关键词 Topic or keywords - description 页面描述 Page description - 请查看 api/services/dataset_service.py 了解各文档类型所需字段的详细信息。 + 请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。 针对"其他"类型文档,接受任何有效的JSON对象 @@ -825,7 +825,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi - description 页面描述 网页的描述信息 - 请查看 api/services/dataset_service.py 了解各文档类型所需字段的详细信息。 + 请查看 
[api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。 针对"其他"类型文档,接受任何有效的JSON对象