-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconverting_html_chat_logs.py
337 lines (282 loc) · 11.9 KB
/
converting_html_chat_logs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
import sys
import re
from bs4 import BeautifulSoup
import html
def process_inline_code(element):
"""
处理内联代码块,将code标签转换为markdown的内联代码格式
"""
result = []
for content in element.contents:
if content.name == 'code':
# 使用markdown的内联代码格式 `code`
result.append(f'`{content.get_text()}`')
elif isinstance(content, str):
result.append(content)
else:
result.append(str(content.string) if content.string else '')
return ''.join(result)
def extract_code_block(element):
"""
从pre标签中提取代码块内容并转换为markdown格式
特别处理"示例输出"类型的代码块
"""
# 首先尝试查找 text-text-300 类的div来获取语言
lang_div = element.find('div', class_='text-text-300')
# 检查是否为示例输出代码块
code_element = element.find('code')
if code_element and code_element.get('class') is None:
# 这可能是一个示例输出代码块,直接作为普通文本处理
text = code_element.get_text().strip()
if text:
return text + "\n"
return ''
# 处理正常代码块
if not lang_div:
# 查找所有 div,检查text content
all_divs = element.find_all('div')
for div in all_divs:
text_content = div.get_text().strip().lower()
if text_content in ['python', 'bash', 'javascript', 'html', 'css', 'java', 'cpp', 'c++', 'c', 'markdown']:
lang_div = div
break
language = lang_div.get_text().strip() if lang_div else ''
# 查找代码内容
code_div = element.find('div', class_='code-block__code')
if not code_div:
code_div = element
if not code_div:
return ''
# 提取代码内容
code_content = ''
code_element = code_div.find('code')
if code_element:
# 获取所有文本节点
for item in code_element.strings:
code_content += item
# 确保代码内容两端没有多余的空行
code_content = code_content.strip()
# 特殊处理:如果代码内容看起来像普通文本(没有特殊格式和缩进),则作为普通文本处理
lines = code_content.split('\n')
if len(lines) == 1 and not any(char in code_content for char in '{}[]()=><'):
return code_content + "\n"
# 按markdown代码块格式返回
if language:
return f"```{language}\n{code_content}\n```\n"
else:
return f"```\n{code_content}\n```\n"
def process_list_items(element, depth=0, start_number=1):
"""
递归处理列表及其嵌套列表
"""
result = []
items = element.find_all('li', recursive=False)
is_ordered = element.name == 'ol'
indent = " " * depth # 使用4个空格作为缩进
current_number = start_number
for item in items:
# 处理当前列表项的文本(包括可能的内联代码)
content = process_inline_code(item)
# 处理嵌套列表前的内容
main_content = content
nested_lists = item.find_all(['ul', 'ol'], recursive=False)
if nested_lists:
# 如果有嵌套列表,需要分离主要内容和嵌套列表
first_text = item.find(string=True, recursive=False)
if first_text and first_text.parent:
main_content = process_inline_code(first_text.parent)
# 添加列表项标记
if is_ordered:
result.append(f"{indent}{current_number}. {main_content}")
current_number += 1
else:
result.append(f"{indent}* {main_content}")
# 处理列表项中的代码块
code_blocks = item.find_all('pre', recursive=True)
if code_blocks:
for code_block in code_blocks:
# 确保代码块前有空行
result.append("")
# 提取并添加代码块
result.append(extract_code_block(code_block))
# 处理嵌套列表
for nested_list in nested_lists:
# 递归处理嵌套列表
nested_items = process_list_items(nested_list, depth + 1)
result.extend(nested_items)
return result
def convert_ai_message_to_md(content):
"""
将AI消息的HTML内容转换为Markdown格式,保持格式化
"""
soup = BeautifulSoup(content, 'html.parser')
md_lines = []
current_list_number = 1
# 首先尝试找到消息的主体内容
main_content = soup.find('div', class_='grid-cols-1')
if not main_content:
main_content = soup
def should_process_element(elem):
if not elem.name:
return False
# 跳过图片相关元素
if elem.find('img') or elem.get('data-testid', '').startswith('image'):
return False
# 跳过按钮和相关UI元素
if elem.name == 'div' and elem.get('class'):
skip_classes = ['font-styrene', 'relative', 'absolute', '-bottom-0', 'flex items-center']
return not any(cls in elem.get('class', []) for cls in skip_classes)
return elem.name in ['p', 'pre', 'ol', 'ul', 'div']
# 获取需要处理的元素
elements = []
for elem in main_content.children:
if should_process_element(elem):
if elem.name == 'div':
# 检查div是否为代码块容器
if not elem.find('pre') and not elem.find('code'):
continue
elements.append(elem)
last_was_code = False
for i, element in enumerate(elements):
if element.name == 'pre':
text = extract_code_block(element)
if not text.startswith('```'):
# 这是普通文本
if last_was_code:
md_lines.append('')
md_lines.append(text)
else:
# 这是代码块
if md_lines and md_lines[-1].strip():
md_lines.append('')
md_lines.append(text)
last_was_code = True
continue
elif element.name == 'p':
text = process_inline_code(element)
if text:
if last_was_code:
md_lines.append('')
md_lines.append(f"{text}\n")
last_was_code = False
elif element.name in ['ol', 'ul']:
if last_was_code:
md_lines.append('')
if md_lines and md_lines[-1].strip():
md_lines.append("")
start_number = element.get('start')
if start_number:
current_list_number = int(start_number)
list_items = process_list_items(element, depth=0, start_number=current_list_number)
md_lines.extend(list_items)
if element.name == 'ol':
current_list_number += len(element.find_all('li', recursive=False))
if i < len(elements) - 1 and elements[i + 1].name not in ['ol', 'ul']:
md_lines.append("")
last_was_code = False
# 清理多余空行
result = []
prev_line = ""
for line in md_lines:
if line.strip() or (prev_line.strip() and not line.strip()):
result.append(line)
prev_line = line
# 移除最后的空行
while result and not result[-1].strip():
result.pop()
return "\n".join(result)
def convert_html_to_md(html_content):
"""
将HTML格式的聊天记录转换为Markdown格式
"""
# 对话标识模式
user_start = r'</div></div><div data-testid="user-message"'
user_end = r'<div class="absolute -bottom-0 -right-1.5"'
ai_start = r'<div class="grid-cols-1 grid gap-2\.5\s*\[&_>_\*\]:min-w-0">'
# 分割对话内容
conversations = []
current_pos = 0
while current_pos < len(html_content):
# 查找下一个用户或AI的发言开始位置
user_match = re.search(user_start, html_content[current_pos:])
ai_match = re.search(ai_start, html_content[current_pos:])
# 如果找不到更多的对话了,就结束
if not user_match and not ai_match:
break
if not ai_match or (user_match and user_match.start() < ai_match.start()):
# 处理用户发言
if user_match:
start_pos = current_pos + user_match.start()
content_start = start_pos + len(user_start)
# 找到用户发言的结束位置(下一个AI发言开始或用户发言结束标记)
next_ai = re.search(ai_start, html_content[content_start:])
end_mark = re.search(user_end, html_content[content_start:])
if end_mark and (not next_ai or end_mark.start() < next_ai.start()):
content_end = content_start + end_mark.start()
elif next_ai:
content_end = content_start + next_ai.start()
else:
content_end = len(html_content)
# 提取用户消息内容
content = html_content[content_start:content_end]
# 使用BeautifulSoup提取文本
soup = BeautifulSoup(content, 'html.parser')
user_text = ' '.join(p.get_text().strip() for p in soup.find_all('p'))
if user_text:
conversations.append(("user", user_text))
current_pos = content_end
else:
# 处理AI发言
start_pos = current_pos + ai_match.start()
content_start = start_pos
# 查找下一个用户发言作为当前AI发言的结束点
next_user = re.search(user_start, html_content[content_start:])
if next_user:
content_end = content_start + next_user.start()
else:
content_end = len(html_content)
# 提取内容
content = html_content[content_start:content_end]
conversations.append(("ai", content))
current_pos = content_end
# 转换为Markdown格式
md_content = []
for speaker, content in conversations:
if speaker == "user":
# 用户消息已经是纯文本了
md_content.append(f"\n## User\n")
md_content.append(content)
else:
# AI消息需要保持格式
md_content.append(f"\n## AI\n")
md_content.append(convert_ai_message_to_md(content))
return "\n".join(md_content)
def main():
# 检查命令行参数
if len(sys.argv) != 2:
print("Usage: python script.py input.html")
print("Example: python script.py chat.html")
sys.exit(1)
# 获取输入文件路径
input_file = sys.argv[1]
# 检查文件是否以.html结尾
if not input_file.lower().endswith('.html'):
print("Error: Input file must be an HTML file (.html)")
sys.exit(1)
# 生成输出文件名(替换.html为.md)
output_file = input_file[:-5] + '.md' # 去掉.html后加上.md
try:
# 读取HTML文件
with open(input_file, 'r', encoding='utf-8') as f:
html_content = f.read()
# 转换内容
md_content = convert_html_to_md(html_content)
# 写入Markdown文件
with open(output_file, 'w', encoding='utf-8') as f:
f.write(md_content)
print(f"Successfully converted {input_file} to {output_file}")
except Exception as e:
print(f"Error: {str(e)}")
sys.exit(1)
if __name__ == "__main__":
main()