# html2md-converter (1).py
# Convert a saved HTML chat transcript (input.html) into Markdown (output.md).
import re
from bs4 import BeautifulSoup
import html
def convert_ai_message_to_md(content):
    """Convert the HTML body of one AI message to Markdown.

    Only the top-level nodes of ``content`` are inspected:

    * ``<p>`` becomes a paragraph followed by a blank line;
    * ``<ol>`` becomes a numbered list built from its direct ``<li>``
      children, numbered from 1;
    * ``<ul>`` becomes a ``*`` bullet list built from its direct ``<li>``
      children;
    * bare text that is not wrapped in any tag (for example a fragment whose
      opening ``<p>`` tag was cut off before it reached this function) is
      kept as a paragraph instead of being silently dropped.

    Any other top-level tag is ignored, and inline markup inside a paragraph
    or list item is flattened to plain text by ``get_text()``.

    Args:
        content: HTML fragment holding the message body.

    Returns:
        The Markdown text, with the generated lines joined by newlines.
    """
    # Local import so this function stays self-contained; bs4 is already a
    # dependency of this file.
    from bs4 import NavigableString

    soup = BeautifulSoup(content, 'html.parser')
    md_lines = []
    for element in soup.children:
        if not element.name:
            # Exact type check on purpose: comments, doctypes, CDATA etc. are
            # NavigableString subclasses and must stay skipped.
            if type(element) is NavigableString:
                text = str(element).strip()
                if text:  # ignore whitespace between tags
                    md_lines.append(f"{text}\n")
            continue

        if element.name == 'p':
            # Plain paragraph; the trailing newline yields a blank line
            # after the paragraph once everything is joined.
            text = element.get_text().strip()
            md_lines.append(f"{text}\n")
        elif element.name in ('ol', 'ul'):
            ordered = element.name == 'ol'
            md_lines.append("")  # blank line before the list
            # Direct children only, so nested lists are not flattened into
            # separate top-level items.
            items = element.find_all('li', recursive=False)
            for index, item in enumerate(items, 1):
                marker = f"{index}." if ordered else "*"
                md_lines.append(f"{marker} {item.get_text().strip()}")
            md_lines.append("")  # blank line after the list
    return "\n".join(md_lines)
# Markers that delimit the messages in the saved chat page.  They are compiled
# once so they can be searched from an absolute offset instead of re-slicing
# the whole document on every loop iteration.
_USER_START_RE = re.compile(
    r'<div class="flex shrink-0 items-center justify-center rounded-full font-bold h-7 w-7 text-\[12px \] bg-accent-pro-100 text-oncolor-100">([^<]+)</div></div><div data-testid="user-message"'
)
# Group 1 is the opening <p> tag of the first paragraph.  It is part of the
# marker (so the same positions match as before) but is kept in the extracted
# content so that the first paragraph stays a well-formed <p> element.
_AI_START_RE = re.compile(
    r'<div class="grid-cols-1 grid gap-2\.5 \[&_>_\*\]:min-w-0">(<p class="whitespace-pre-wrap break-words">)'
)
_AI_END_RE = re.compile(
    r'</div></div></div><div class="absolute -bottom-0 -right-1\.5" style="transform: none;">'
)
# Remainder of an opening tag up to and including its closing ">", allowing
# ">" to appear inside quoted attribute values.
_TAG_REST_RE = re.compile(r'''(?:[^>"']|"[^"]*"|'[^']*')*>''')


def _split_conversation(html_content):
    """Split the page into an ordered list of ``(speaker, html)`` pairs."""
    conversations = []
    doc_len = len(html_content)
    pos = 0
    while pos < doc_len:
        # All match positions below are absolute offsets into html_content.
        user_match = _USER_START_RE.search(html_content, pos)
        ai_match = _AI_START_RE.search(html_content, pos)
        if user_match is None and ai_match is None:
            break  # no more messages

        user_is_next = ai_match is None or (
            user_match is not None and user_match.start() < ai_match.start()
        )
        if user_is_next:
            # A user message runs until the next AI message or end of file.
            content_start = user_match.end()
            next_ai = _AI_START_RE.search(html_content, content_start)
            content_end = next_ai.start() if next_ai else doc_len
            # The user marker stops in the middle of the opening
            # <div data-testid="user-message" ...> tag; skip the rest of that
            # tag so its attributes do not end up in the message text.
            tag_rest = _TAG_REST_RE.match(html_content, content_start, content_end)
            if tag_rest:
                content_start = tag_rest.end()
            conversations.append(("user", html_content[content_start:content_end]))
            pos = content_end
        else:
            # An AI message runs until its end marker or end of file.
            content_start = ai_match.start(1)
            end_match = _AI_END_RE.search(html_content, content_start)
            if end_match:
                content_end = end_match.start()
                pos = end_match.end()
            else:
                content_end = doc_len
                pos = doc_len
            conversations.append(("ai", html_content[content_start:content_end]))
    return conversations


def convert_html_to_md(html_content):
    """Convert an HTML chat transcript to Markdown.

    The page is scanned for the user-message and AI-message markers.  Each
    user message is reduced to plain text and each AI message is converted
    with ``convert_ai_message_to_md``.  Every message is emitted under a
    ``## <name>`` heading, where the name is the text captured by the first
    user marker for user messages and ``AI`` for AI messages.

    Args:
        html_content: Full HTML text of the saved chat page.

    Returns:
        The Markdown document.  If no user message is found, or an AI message
        appears before the first user message, a one-line error message is
        returned instead (no exception is raised).
    """
    first_user = _USER_START_RE.search(html_content)
    if first_user is None:
        return "无法识别对话内容:未找到用户发言"

    # The transcript must open with a user message.
    first_ai = _AI_START_RE.search(html_content)
    if first_ai is not None and first_ai.start() < first_user.start():
        return "对话格式错误:第一条消息不是用户发言"

    # The name captured from the first user marker labels every user message.
    username = first_user.group(1)

    md_content = []
    for speaker, content in _split_conversation(html_content):
        if speaker == "user":
            # User messages are flattened to plain text.
            clean_text = BeautifulSoup(content, 'html.parser').get_text().strip()
            md_content.append(f"\n## {username}\n")
            md_content.append(clean_text)
        else:
            # AI messages keep their paragraph/list structure.
            md_content.append("\n## AI\n")
            md_content.append(convert_ai_message_to_md(content))
    return "\n".join(md_content)
def main(input_path='input.html', output_path='output.md'):
    """Convert a saved HTML chat transcript file into a Markdown file.

    Args:
        input_path: HTML file to read, decoded as UTF-8.  Defaults to
            ``input.html``, the path that used to be hard-coded.
        output_path: Markdown file to write as UTF-8; it is opened in ``'w'``
            mode, so an existing file is replaced.  Defaults to ``output.md``.
    """
    # Read the HTML page.
    with open(input_path, 'r', encoding='utf-8') as f:
        html_content = f.read()

    # Convert it.
    md_content = convert_html_to_md(html_content)

    # Write the Markdown result.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(md_content)
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()