diff --git a/api/controllers/service_api/dataset/document.py b/api/controllers/service_api/dataset/document.py
index 2e148dd84c05da..b4c3a4c6075781 100644
--- a/api/controllers/service_api/dataset/document.py
+++ b/api/controllers/service_api/dataset/document.py
@@ -18,6 +18,7 @@
from controllers.service_api.dataset.error import (
ArchivedDocumentImmutableError,
DocumentIndexingError,
+ InvalidMetadataError,
)
from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check
from core.errors.error import ProviderTokenNotInitError
@@ -50,6 +51,9 @@ def post(self, tenant_id, dataset_id):
"indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
)
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
+ parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
+ parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
+
args = parser.parse_args()
dataset_id = str(dataset_id)
tenant_id = str(tenant_id)
@@ -61,6 +65,28 @@ def post(self, tenant_id, dataset_id):
if not dataset.indexing_technique and not args["indexing_technique"]:
raise ValueError("indexing_technique is required.")
+ # Validate metadata if provided
+ if args.get("doc_type") or args.get("doc_metadata"):
+ if not args.get("doc_type") or not args.get("doc_metadata"):
+ raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
+
+ if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
+ raise InvalidMetadataError(
+ "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
+ )
+
+ if not isinstance(args["doc_metadata"], dict):
+ raise InvalidMetadataError("doc_metadata must be a dictionary")
+
+ # Validate metadata schema based on doc_type
+ if args["doc_type"] != "others":
+ metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
+ for key, value in args["doc_metadata"].items():
+ if key in metadata_schema and not isinstance(value, metadata_schema[key]):
+ raise InvalidMetadataError(f"Invalid type for metadata field {key}")
+ # set to MetaDataConfig
+ args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
+
text = args.get("text")
name = args.get("name")
if text is None or name is None:
@@ -107,6 +133,8 @@ def post(self, tenant_id, dataset_id, document_id):
"doc_language", type=str, default="English", required=False, nullable=False, location="json"
)
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
+ parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
+ parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
args = parser.parse_args()
dataset_id = str(dataset_id)
tenant_id = str(tenant_id)
@@ -115,6 +143,29 @@ def post(self, tenant_id, dataset_id, document_id):
if not dataset:
raise ValueError("Dataset is not exist.")
+ # Validate metadata if provided
+ if args.get("doc_type") or args.get("doc_metadata"):
+ if not args.get("doc_type") or not args.get("doc_metadata"):
+ raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
+
+ if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
+ raise InvalidMetadataError(
+ "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
+ )
+
+ if not isinstance(args["doc_metadata"], dict):
+ raise InvalidMetadataError("doc_metadata must be a dictionary")
+
+ # Validate metadata schema based on doc_type
+ if args["doc_type"] != "others":
+ metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
+ for key, value in args["doc_metadata"].items():
+ if key in metadata_schema and not isinstance(value, metadata_schema[key]):
+ raise InvalidMetadataError(f"Invalid type for metadata field {key}")
+
+ # set to MetaDataConfig
+ args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
+
if args["text"]:
text = args.get("text")
name = args.get("name")
@@ -161,6 +212,30 @@ def post(self, tenant_id, dataset_id):
args["doc_form"] = "text_model"
if "doc_language" not in args:
args["doc_language"] = "English"
+
+ # Validate metadata if provided
+ if args.get("doc_type") or args.get("doc_metadata"):
+ if not args.get("doc_type") or not args.get("doc_metadata"):
+ raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
+
+ if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
+ raise InvalidMetadataError(
+ "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
+ )
+
+ if not isinstance(args["doc_metadata"], dict):
+ raise InvalidMetadataError("doc_metadata must be a dictionary")
+
+ # Validate metadata schema based on doc_type
+ if args["doc_type"] != "others":
+ metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
+ for key, value in args["doc_metadata"].items():
+ if key in metadata_schema and not isinstance(value, metadata_schema[key]):
+ raise InvalidMetadataError(f"Invalid type for metadata field {key}")
+
+ # set to MetaDataConfig
+ args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
+
# get dataset info
dataset_id = str(dataset_id)
tenant_id = str(tenant_id)
@@ -228,6 +303,29 @@ def post(self, tenant_id, dataset_id, document_id):
if "doc_language" not in args:
args["doc_language"] = "English"
+ # Validate metadata if provided
+ if args.get("doc_type") or args.get("doc_metadata"):
+ if not args.get("doc_type") or not args.get("doc_metadata"):
+ raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
+
+ if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
+ raise InvalidMetadataError(
+ "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
+ )
+
+ if not isinstance(args["doc_metadata"], dict):
+ raise InvalidMetadataError("doc_metadata must be a dictionary")
+
+ # Validate metadata schema based on doc_type
+ if args["doc_type"] != "others":
+ metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
+ for key, value in args["doc_metadata"].items():
+ if key in metadata_schema and not isinstance(value, metadata_schema[key]):
+ raise InvalidMetadataError(f"Invalid type for metadata field {key}")
+
+ # set to MetaDataConfig
+ args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
+
# get dataset info
dataset_id = str(dataset_id)
tenant_id = str(tenant_id)
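
Note: the four service-API endpoints above repeat the same doc_type/doc_metadata validation verbatim. Below is a minimal sketch, not part of this diff, of how that block could be pulled into one shared helper; the helper name and its placement in the controller module are assumptions.

```python
# Illustrative only: a possible shared helper mirroring the validation repeated above.
from controllers.service_api.dataset.error import InvalidMetadataError
from services.dataset_service import DocumentService


def validate_doc_metadata_args(args: dict) -> None:
    """Validate doc_type/doc_metadata the same way each endpoint above does."""
    if not (args.get("doc_type") or args.get("doc_metadata")):
        return  # nothing to validate

    if not args.get("doc_type") or not args.get("doc_metadata"):
        raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")

    if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
        raise InvalidMetadataError(
            "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
        )

    if not isinstance(args["doc_metadata"], dict):
        raise InvalidMetadataError("doc_metadata must be a dictionary")

    # "others" accepts any JSON object; every other doc_type is checked against its schema
    if args["doc_type"] != "others":
        metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
        for key, value in args["doc_metadata"].items():
            if key in metadata_schema and not isinstance(value, metadata_schema[key]):
                raise InvalidMetadataError(f"Invalid type for metadata field {key}")

    args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
```
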
diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py
index c4059337367c1a..38025b5213aaa3 100644
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@@ -42,6 +42,7 @@
from services.entities.knowledge_entities.knowledge_entities import (
ChildChunkUpdateArgs,
KnowledgeConfig,
+ MetaDataConfig,
RerankingModel,
RetrievalModel,
SegmentUpdateArgs,
@@ -894,6 +895,9 @@ def save_document_with_dataset_id(
document.data_source_info = json.dumps(data_source_info)
document.batch = batch
document.indexing_status = "waiting"
+ if knowledge_config.metadata:
+ document.doc_type = knowledge_config.metadata.doc_type
+ document.doc_metadata = knowledge_config.metadata.doc_metadata
db.session.add(document)
documents.append(document)
duplicate_document_ids.append(document.id)
@@ -910,6 +914,7 @@ def save_document_with_dataset_id(
account,
file_name,
batch,
+ knowledge_config.metadata,
)
db.session.add(document)
db.session.flush()
@@ -965,6 +970,7 @@ def save_document_with_dataset_id(
account,
page.page_name,
batch,
+ knowledge_config.metadata,
)
db.session.add(document)
db.session.flush()
@@ -1005,6 +1011,7 @@ def save_document_with_dataset_id(
account,
document_name,
batch,
+ knowledge_config.metadata,
)
db.session.add(document)
db.session.flush()
@@ -1042,6 +1049,7 @@ def build_document(
account: Account,
name: str,
batch: str,
+ metadata: Optional[MetaDataConfig] = None,
):
document = Document(
tenant_id=dataset.tenant_id,
@@ -1057,6 +1065,9 @@ def build_document(
doc_form=document_form,
doc_language=document_language,
)
+ if metadata is not None:
+ document.doc_metadata = metadata.doc_metadata
+ document.doc_type = metadata.doc_type
return document
@staticmethod
@@ -1169,6 +1180,10 @@ def update_document_with_dataset_id(
# update document name
if document_data.name:
document.name = document_data.name
+ # update doc_type and doc_metadata if provided
+ if document_data.metadata is not None:
+ document.doc_metadata = document_data.metadata.doc_metadata
+ document.doc_type = document_data.metadata.doc_type
# update document to be waiting
document.indexing_status = "waiting"
document.completed_at = None
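
For reference, a minimal sketch (illustrative values, not part of the diff) of the one-to-one field mapping that build_document and update_document_with_dataset_id apply to the Document model:

```python
from services.entities.knowledge_entities.knowledge_entities import MetaDataConfig

# Illustrative values; any doc_type/doc_metadata pair accepted by the controller validation works.
metadata = MetaDataConfig(
    doc_type="book",
    doc_metadata={"title": "Example Book", "language": "en", "author": "Jane Doe"},
)

# Both code paths copy the fields onto the ORM columns:
#   document.doc_type     <- metadata.doc_type      -> "book"
#   document.doc_metadata <- metadata.doc_metadata  -> {"title": "Example Book", ...}
```
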
diff --git a/api/services/entities/knowledge_entities/knowledge_entities.py b/api/services/entities/knowledge_entities/knowledge_entities.py
index 8d6a246b6428d0..f14c5b513a8687 100644
--- a/api/services/entities/knowledge_entities/knowledge_entities.py
+++ b/api/services/entities/knowledge_entities/knowledge_entities.py
@@ -93,6 +93,11 @@ class RetrievalModel(BaseModel):
score_threshold: Optional[float] = None
+class MetaDataConfig(BaseModel):
+ doc_type: str
+ doc_metadata: dict
+
+
class KnowledgeConfig(BaseModel):
original_document_id: Optional[str] = None
duplicate: bool = True
@@ -105,6 +110,7 @@ class KnowledgeConfig(BaseModel):
embedding_model: Optional[str] = None
embedding_model_provider: Optional[str] = None
name: Optional[str] = None
+ metadata: Optional[MetaDataConfig] = None
class SegmentUpdateArgs(BaseModel):
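
A small sketch of how the plain dict assembled by the controllers (args["metadata"]) parses into the new nested model, assuming the remaining KnowledgeConfig fields keep their defaults or stay optional:

```python
from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig

# Shape produced by the controllers: {"doc_type": ..., "doc_metadata": {...}}
config = KnowledgeConfig(
    name="Example document",
    metadata={
        "doc_type": "web_page",
        "doc_metadata": {"title": "Example page", "url": "https://example.com"},
    },
)

print(config.metadata.doc_type)      # web_page
print(config.metadata.doc_metadata)  # {'title': 'Example page', 'url': 'https://example.com'}
```
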
diff --git a/web/app/(commonLayout)/datasets/template/template.en.mdx b/web/app/(commonLayout)/datasets/template/template.en.mdx
index 3fa22a1620ed9e..ac57e3aef2233b 100644
--- a/web/app/(commonLayout)/datasets/template/template.en.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.en.mdx
@@ -47,6 +47,44 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
Document content
+
+      Type of document (optional):
+       - <code>book</code> Book
+       - <code>web_page</code> Web page
+       - <code>paper</code> Academic paper/article
+       - <code>social_media_post</code> Social media post
+       - <code>wikipedia_entry</code> Wikipedia entry
+       - <code>personal_document</code> Personal document
+       - <code>business_document</code> Business document
+       - <code>im_chat_log</code> Chat log
+       - <code>synced_from_notion</code> Notion document
+       - <code>synced_from_github</code> GitHub document
+       - <code>others</code> Other document types
+
+
+      Document metadata (required if doc_type is provided). Fields vary by doc_type:
+      For <code>book</code>:
+       - <code>title</code> Book title
+       - <code>language</code> Book language
+       - <code>author</code> Book author
+       - <code>publisher</code> Publisher name
+       - <code>publication_date</code> Publication date
+       - <code>isbn</code> ISBN number
+       - <code>category</code> Book category
+
+      For <code>web_page</code>:
+       - <code>title</code> Page title
+       - <code>url</code> Page URL
+       - <code>language</code> Page language
+       - <code>publish_date</code> Publish date
+       - <code>author/publisher</code> Author or publisher
+       - <code>topic/keywords</code> Topic or keywords
+       - <code>description</code> Page description
+
+      Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
+
+      For doc_type <code>others</code>, any valid JSON object is accepted
+
Index mode
       - <code>high_quality</code> High quality: embedding using embedding model, built as vector database index
@@ -195,6 +233,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
       - <code>hierarchical_model</code> Parent-child mode
       - <code>qa_model</code> Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions
+       - <code>doc_type</code> Type of document (optional)
+         - <code>book</code> Book
+           Document records a book or publication
+         - <code>web_page</code> Web page
+           Document records web page content
+         - <code>paper</code> Academic paper/article
+           Document records academic paper or research article
+         - <code>social_media_post</code> Social media post
+           Content from social media posts
+         - <code>wikipedia_entry</code> Wikipedia entry
+           Content from Wikipedia entries
+         - <code>personal_document</code> Personal document
+           Documents related to personal content
+         - <code>business_document</code> Business document
+           Documents related to business content
+         - <code>im_chat_log</code> Chat log
+           Records of instant messaging chats
+         - <code>synced_from_notion</code> Notion document
+           Documents synchronized from Notion
+         - <code>synced_from_github</code> GitHub document
+           Documents synchronized from GitHub
+         - <code>others</code> Other document types
+           Other document types not listed above
+
+       - <code>doc_metadata</code> Document metadata (required if doc_type is provided)
+         Fields vary by doc_type:
+
+         For <code>book</code>:
+         - <code>title</code> Book title
+           Title of the book
+         - <code>language</code> Book language
+           Language of the book
+         - <code>author</code> Book author
+           Author of the book
+         - <code>publisher</code> Publisher name
+           Name of the publishing house
+         - <code>publication_date</code> Publication date
+           Date when the book was published
+         - <code>isbn</code> ISBN number
+           International Standard Book Number
+         - <code>category</code> Book category
+           Category or genre of the book
+
+         For <code>web_page</code>:
+         - <code>title</code> Page title
+           Title of the web page
+         - <code>url</code> Page URL
+           URL address of the web page
+         - <code>language</code> Page language
+           Language of the web page
+         - <code>publish_date</code> Publish date
+           Date when the web page was published
+         - <code>author/publisher</code> Author or publisher
+           Author or publisher of the web page
+         - <code>topic/keywords</code> Topic or keywords
+           Topics or keywords of the web page
+         - <code>description</code> Page description
+           Description of the web page content
+
+         Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
+         For doc_type <code>others</code>, any valid JSON object is accepted
+
       - <code>doc_language</code> In Q&A mode, specify the language of the document, for example: <code>English</code>, <code>Chinese</code>
       - <code>process_rule</code> Processing rules
@@ -307,6 +407,44 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
Knowledge description (optional)
+
+      Type of document (optional):
+       - <code>book</code> Book
+       - <code>web_page</code> Web page
+       - <code>paper</code> Academic paper/article
+       - <code>social_media_post</code> Social media post
+       - <code>wikipedia_entry</code> Wikipedia entry
+       - <code>personal_document</code> Personal document
+       - <code>business_document</code> Business document
+       - <code>im_chat_log</code> Chat log
+       - <code>synced_from_notion</code> Notion document
+       - <code>synced_from_github</code> GitHub document
+       - <code>others</code> Other document types
+
+
+      Document metadata (required if doc_type is provided). Fields vary by doc_type:
+      For <code>book</code>:
+       - <code>title</code> Book title
+       - <code>language</code> Book language
+       - <code>author</code> Book author
+       - <code>publisher</code> Publisher name
+       - <code>publication_date</code> Publication date
+       - <code>isbn</code> ISBN number
+       - <code>category</code> Book category
+
+      For <code>web_page</code>:
+       - <code>title</code> Page title
+       - <code>url</code> Page URL
+       - <code>language</code> Page language
+       - <code>publish_date</code> Publish date
+       - <code>author/publisher</code> Author or publisher
+       - <code>topic/keywords</code> Topic or keywords
+       - <code>description</code> Page description
+
+      Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
+
+      For doc_type <code>others</code>, any valid JSON object is accepted
+
Index technique (optional)
       - <code>high_quality</code> High quality
@@ -624,6 +762,67 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
       - <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is ***
       - <code>max_tokens</code> The maximum length (tokens) must be validated to be shorter than the length of the parent chunk
       - <code>chunk_overlap</code> Define the overlap between adjacent chunks (optional)
+       - <code>doc_type</code> Type of document (optional)
+         - <code>book</code> Book
+           Document records a book or publication
+         - <code>web_page</code> Web page
+           Document records web page content
+         - <code>paper</code> Academic paper/article
+           Document records academic paper or research article
+         - <code>social_media_post</code> Social media post
+           Content from social media posts
+         - <code>wikipedia_entry</code> Wikipedia entry
+           Content from Wikipedia entries
+         - <code>personal_document</code> Personal document
+           Documents related to personal content
+         - <code>business_document</code> Business document
+           Documents related to business content
+         - <code>im_chat_log</code> Chat log
+           Records of instant messaging chats
+         - <code>synced_from_notion</code> Notion document
+           Documents synchronized from Notion
+         - <code>synced_from_github</code> GitHub document
+           Documents synchronized from GitHub
+         - <code>others</code> Other document types
+           Other document types not listed above
+
+       - <code>doc_metadata</code> Document metadata (required if doc_type is provided)
+         Fields vary by doc_type:
+
+         For <code>book</code>:
+         - <code>title</code> Book title
+           Title of the book
+         - <code>language</code> Book language
+           Language of the book
+         - <code>author</code> Book author
+           Author of the book
+         - <code>publisher</code> Publisher name
+           Name of the publishing house
+         - <code>publication_date</code> Publication date
+           Date when the book was published
+         - <code>isbn</code> ISBN number
+           International Standard Book Number
+         - <code>category</code> Book category
+           Category or genre of the book
+
+         For <code>web_page</code>:
+         - <code>title</code> Page title
+           Title of the web page
+         - <code>url</code> Page URL
+           URL address of the web page
+         - <code>language</code> Page language
+           Language of the web page
+         - <code>publish_date</code> Publish date
+           Date when the web page was published
+         - <code>author/publisher</code> Author or publisher
+           Author or publisher of the web page
+         - <code>topic/keywords</code> Topic or keywords
+           Topics or keywords of the web page
+         - <code>description</code> Page description
+           Description of the web page content
+
+         Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
+         For doc_type <code>others</code>, any valid JSON object is accepted
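
An illustrative request body for the document-creation endpoints documented above (all values are placeholders); per the controller validation added in this diff, doc_type and doc_metadata must be supplied together, and doc_metadata should follow the schema for the chosen doc_type:

```python
# Placeholder payload; adjust the fields to your dataset and document.
payload = {
    "name": "Example web page",
    "text": "Page content goes here...",
    "indexing_technique": "high_quality",
    "doc_type": "web_page",
    "doc_metadata": {
        "title": "Example page",
        "url": "https://example.com",
        "language": "en",
        "description": "Short description of the page content",
    },
}
```
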
diff --git a/web/app/(commonLayout)/datasets/template/template.zh.mdx b/web/app/(commonLayout)/datasets/template/template.zh.mdx
index 334591743f931b..0e5857c4464d8a 100644
--- a/web/app/(commonLayout)/datasets/template/template.zh.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx
@@ -47,6 +47,46 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
文档内容
+
+      文档类型(选填)
+       - <code>book</code> 图书 Book
+       - <code>web_page</code> 网页 Web page
+       - <code>paper</code> 学术论文/文章 Academic paper/article
+       - <code>social_media_post</code> 社交媒体帖子 Social media post
+       - <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
+       - <code>personal_document</code> 个人文档 Personal document
+       - <code>business_document</code> 商业文档 Business document
+       - <code>im_chat_log</code> 即时通讯记录 Chat log
+       - <code>synced_from_notion</code> Notion同步文档 Notion document
+       - <code>synced_from_github</code> GitHub同步文档 GitHub document
+       - <code>others</code> 其他文档类型 Other document types
+
+
+
+      文档元数据(如提供文档类型则必填)。字段因文档类型而异:
+
+      针对图书 For <code>book</code>:
+       - <code>title</code> 书名 Book title
+       - <code>language</code> 图书语言 Book language
+       - <code>author</code> 作者 Book author
+       - <code>publisher</code> 出版社 Publisher name
+       - <code>publication_date</code> 出版日期 Publication date
+       - <code>isbn</code> ISBN号码 ISBN number
+       - <code>category</code> 图书分类 Book category
+
+      针对网页 For <code>web_page</code>:
+       - <code>title</code> 页面标题 Page title
+       - <code>url</code> 页面网址 Page URL
+       - <code>language</code> 页面语言 Page language
+       - <code>publish_date</code> 发布日期 Publish date
+       - <code>author/publisher</code> 作者/发布者 Author or publisher
+       - <code>topic/keywords</code> 主题/关键词 Topic or keywords
+       - <code>description</code> 页面描述 Page description
+
+      请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
+
+      针对"其他"类型文档,接受任何有效的JSON对象
+
索引方式
       - <code>high_quality</code> 高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引
@@ -194,6 +234,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
       - <code>text_model</code> text 文档直接 embedding,经济模式默认为该模式
       - <code>hierarchical_model</code> parent-child 模式
       - <code>qa_model</code> Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding
+       - <code>doc_type</code> 文档类型(选填)Type of document (optional)
+         - <code>book</code> 图书
+           文档记录一本书籍或出版物
+         - <code>web_page</code> 网页
+           网页内容的文档记录
+         - <code>paper</code> 学术论文/文章
+           学术论文或研究文章的记录
+         - <code>social_media_post</code> 社交媒体帖子
+           社交媒体上的帖子内容
+         - <code>wikipedia_entry</code> 维基百科条目
+           维基百科的词条内容
+         - <code>personal_document</code> 个人文档
+           个人相关的文档记录
+         - <code>business_document</code> 商业文档
+           商业相关的文档记录
+         - <code>im_chat_log</code> 即时通讯记录
+           即时通讯的聊天记录
+         - <code>synced_from_notion</code> Notion同步文档
+           从Notion同步的文档内容
+         - <code>synced_from_github</code> GitHub同步文档
+           从GitHub同步的文档内容
+         - <code>others</code> 其他文档类型
+           其他未列出的文档类型
+
+       - <code>doc_metadata</code> 文档元数据(如提供文档类型则必填)
+         字段因文档类型而异:
+
+         针对图书类型 For <code>book</code>:
+         - <code>title</code> 书名
+           书籍的标题
+         - <code>language</code> 图书语言
+           书籍的语言
+         - <code>author</code> 作者
+           书籍的作者
+         - <code>publisher</code> 出版社
+           出版社的名称
+         - <code>publication_date</code> 出版日期
+           书籍的出版日期
+         - <code>isbn</code> ISBN号码
+           书籍的ISBN编号
+         - <code>category</code> 图书分类
+           书籍的分类类别
+
+         针对网页类型 For <code>web_page</code>:
+         - <code>title</code> 页面标题
+           网页的标题
+         - <code>url</code> 页面网址
+           网页的URL地址
+         - <code>language</code> 页面语言
+           网页的语言
+         - <code>publish_date</code> 发布日期
+           网页的发布日期
+         - <code>author/publisher</code> 作者/发布者
+           网页的作者或发布者
+         - <code>topic/keywords</code> 主题/关键词
+           网页的主题或关键词
+         - <code>description</code> 页面描述
+           网页的描述信息
+
+         请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
+
+         针对"其他"类型文档,接受任何有效的JSON对象
       - <code>doc_language</code> 在 Q&A 模式下,指定文档的语言,例如:<code>English</code>、<code>Chinese</code>
@@ -504,6 +606,46 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
文档内容(选填)
+
+      文档类型(选填)
+       - <code>book</code> 图书 Book
+       - <code>web_page</code> 网页 Web page
+       - <code>paper</code> 学术论文/文章 Academic paper/article
+       - <code>social_media_post</code> 社交媒体帖子 Social media post
+       - <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
+       - <code>personal_document</code> 个人文档 Personal document
+       - <code>business_document</code> 商业文档 Business document
+       - <code>im_chat_log</code> 即时通讯记录 Chat log
+       - <code>synced_from_notion</code> Notion同步文档 Notion document
+       - <code>synced_from_github</code> GitHub同步文档 GitHub document
+       - <code>others</code> 其他文档类型 Other document types
+
+
+
+      文档元数据(如提供文档类型则必填)。字段因文档类型而异:
+
+      针对图书 For <code>book</code>:
+       - <code>title</code> 书名 Book title
+       - <code>language</code> 图书语言 Book language
+       - <code>author</code> 作者 Book author
+       - <code>publisher</code> 出版社 Publisher name
+       - <code>publication_date</code> 出版日期 Publication date
+       - <code>isbn</code> ISBN号码 ISBN number
+       - <code>category</code> 图书分类 Book category
+
+      针对网页 For <code>web_page</code>:
+       - <code>title</code> 页面标题 Page title
+       - <code>url</code> 页面网址 Page URL
+       - <code>language</code> 页面语言 Page language
+       - <code>publish_date</code> 发布日期 Publish date
+       - <code>author/publisher</code> 作者/发布者 Author or publisher
+       - <code>topic/keywords</code> 主题/关键词 Topic or keywords
+       - <code>description</code> 页面描述 Page description
+
+      请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
+
+      针对"其他"类型文档,接受任何有效的JSON对象
+
处理规则(选填)
       - <code>mode</code> (string) 清洗、分段模式,automatic 自动 / custom 自定义
@@ -624,6 +766,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
       - <code>separator</code> 分段标识符,目前仅允许设置一个分隔符。默认为 ***
       - <code>max_tokens</code> 最大长度 (token) 需要校验小于父级的长度
       - <code>chunk_overlap</code> 分段重叠指的是在对数据进行分段时,段与段之间存在一定的重叠部分(选填)
+       - <code>doc_type</code> 文档类型(选填)Type of document (optional)
+         - <code>book</code> 图书
+           文档记录一本书籍或出版物
+         - <code>web_page</code> 网页
+           网页内容的文档记录
+         - <code>paper</code> 学术论文/文章
+           学术论文或研究文章的记录
+         - <code>social_media_post</code> 社交媒体帖子
+           社交媒体上的帖子内容
+         - <code>wikipedia_entry</code> 维基百科条目
+           维基百科的词条内容
+         - <code>personal_document</code> 个人文档
+           个人相关的文档记录
+         - <code>business_document</code> 商业文档
+           商业相关的文档记录
+         - <code>im_chat_log</code> 即时通讯记录
+           即时通讯的聊天记录
+         - <code>synced_from_notion</code> Notion同步文档
+           从Notion同步的文档内容
+         - <code>synced_from_github</code> GitHub同步文档
+           从GitHub同步的文档内容
+         - <code>others</code> 其他文档类型
+           其他未列出的文档类型
+
+       - <code>doc_metadata</code> 文档元数据(如提供文档类型则必填)
+         字段因文档类型而异:
+
+         针对图书类型 For <code>book</code>:
+         - <code>title</code> 书名
+           书籍的标题
+         - <code>language</code> 图书语言
+           书籍的语言
+         - <code>author</code> 作者
+           书籍的作者
+         - <code>publisher</code> 出版社
+           出版社的名称
+         - <code>publication_date</code> 出版日期
+           书籍的出版日期
+         - <code>isbn</code> ISBN号码
+           书籍的ISBN编号
+         - <code>category</code> 图书分类
+           书籍的分类类别
+
+         针对网页类型 For <code>web_page</code>:
+         - <code>title</code> 页面标题
+           网页的标题
+         - <code>url</code> 页面网址
+           网页的URL地址
+         - <code>language</code> 页面语言
+           网页的语言
+         - <code>publish_date</code> 发布日期
+           网页的发布日期
+         - <code>author/publisher</code> 作者/发布者
+           网页的作者或发布者
+         - <code>topic/keywords</code> 主题/关键词
+           网页的主题或关键词
+         - <code>description</code> 页面描述
+           网页的描述信息
+
+         请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
+
+         针对"其他"类型文档,接受任何有效的JSON对象