-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathtext_processing.py
More file actions
128 lines (103 loc) · 5.43 KB
/
text_processing.py
File metadata and controls
128 lines (103 loc) · 5.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""
Text processing utilities for chunking and parsing responses.
"""
import re
from typing import List, Dict
class TextProcessor:
    """Stateless helpers for chunking text and for building/parsing Q&A prompts.

    Every public method is a ``@staticmethod``; the class only serves as a
    namespace.
    """

    @staticmethod
    def split_by_word_count(text: str, words_per_chunk: int) -> List[str]:
        """Split ``text`` into chunks of at most ``words_per_chunk`` words.

        Words are the whitespace-separated tokens of ``text``; inside each
        chunk they are re-joined with single spaces, so the original
        whitespace (including newlines) is not preserved.

        Args:
            text: The text to split.
            words_per_chunk: Maximum number of words per chunk; must be >= 1.

        Returns:
            The chunks in order. Empty or whitespace-only text yields ``[]``.

        Raises:
            ValueError: If ``words_per_chunk`` is smaller than 1.
        """
        if words_per_chunk < 1:
            # A zero step makes range() raise an opaque error and a negative
            # step would silently drop the whole text, so fail loudly instead.
            raise ValueError(
                f"words_per_chunk must be a positive integer, got {words_per_chunk!r}"
            )
        words = text.split()
        # str.split() never yields empty tokens, so every chunk is non-empty
        # and already free of leading/trailing whitespace.
        return [
            ' '.join(words[start:start + words_per_chunk])
            for start in range(0, len(words), words_per_chunk)
        ]

    @staticmethod
    def create_prompt_template(custom_prompt: str, num_questions: int, num_exchanges: int) -> str:
        """Create the prompt template for Q&A generation.

        The returned text still contains a literal ``{chunk}`` placeholder
        that the caller is expected to fill with the source text.

        Args:
            custom_prompt: Caller-supplied instructions placed at the top of
                the prompt. It is inserted verbatim.
            num_questions: Number of conversations the model is asked for.
            num_exchanges: Question/answer exchanges per conversation. ``1``
                selects the plain QUESTION/ANSWER format; any other value
                selects the format with ``num_exchanges - 1`` follow-up pairs.

        Returns:
            The prompt template string.
        """
        # NOTE(review): if the placeholder is later filled via str.format(),
        # literal braces inside custom_prompt would be treated as fields --
        # confirm against the caller.
        if num_exchanges == 1:
            format_lines = [
                'Format for each conversation:',
                'CONVERSATION X:',
                'QUESTION: [user question, all lowercase]',
                'ANSWER: [AI response based on text]',
            ]
        else:
            format_lines = [
                'Format for each conversation:',
                'CONVERSATION X:',
                'QUESTION: [initial question from user, all lowercase]',
                'ANSWER: [AI response based on text]',
            ]
            # One FOLLOW-UP / FOLLOW-UP ANSWER pair per extra exchange, each
            # marker on its own line and in conversational order, so the
            # requested layout matches what parse_qa_response() reads back.
            for _ in range(num_exchanges - 1):
                format_lines.append('FOLLOW-UP: [follow-up question, all lowercase]')
                format_lines.append(
                    'FOLLOW-UP ANSWER: [AI response to follow-up, also based on text]'
                )
        format_instructions = '\n' + '\n'.join(format_lines) + '\n'

        template_lines = [
            '',
            custom_prompt,
            f'Based on the following text, generate {num_questions} conversation pairs '
            f'with {num_exchanges} exchange(s) each.',
            'Requirements:',
            '- User questions should be natural and use lowercase',
            '- AI responses should be informative and based on the provided text',
            '- Cover the major concepts mentioned in the text',
            '- Each conversation should feel natural and educational',
            format_instructions,
            'Text content:',
            '{chunk}',
            f'Generate exactly {num_questions} conversations that thoroughly cover the content:',
            '',
        ]
        return '\n'.join(template_lines)

    @staticmethod
    def _clean_question(raw: str) -> str:
        """Strip whitespace and a leading ``12.``-style number, then lowercase."""
        return re.sub(r'^[0-9]+\.?\s*', '', raw.strip()).strip().lower()

    @staticmethod
    def _clean_answer(raw: str) -> str:
        """Strip whitespace and collapse runs of blank lines to a single one."""
        return re.sub(r'\n\n+', '\n\n', raw.strip()).strip()

    @staticmethod
    def _split_question_answer(part: str):
        """Return ``(question_text, text_after_ANSWER)`` or ``None`` if a marker is missing."""
        if 'QUESTION:' not in part:
            return None
        after_question = part.split('QUESTION:', 1)[1]
        if 'ANSWER:' not in after_question:
            return None
        question_raw, rest = after_question.split('ANSWER:', 1)
        return question_raw, rest

    @staticmethod
    def _split_followups(text: str, max_followups: int) -> List[tuple]:
        """Split the text after the first ``FOLLOW-UP:`` into raw (question, answer) pairs.

        At most ``max_followups`` pairs are separated; whatever text remains
        stays attached to the last answer. With ``max_followups <= 1`` this
        yields a single pair whose answer is everything after the first
        ``FOLLOW-UP ANSWER:`` marker.
        """
        pairs = []
        remaining = text
        while 'FOLLOW-UP ANSWER:' in remaining:
            question_raw, after = remaining.split('FOLLOW-UP ANSWER:', 1)
            more_allowed = len(pairs) + 1 < max_followups
            # 'FOLLOW-UP ANSWER:' does not contain 'FOLLOW-UP:', so this only
            # matches the start of the next follow-up question.
            if more_allowed and 'FOLLOW-UP:' in after:
                answer_raw, remaining = after.split('FOLLOW-UP:', 1)
                pairs.append((question_raw, answer_raw))
            else:
                pairs.append((question_raw, after))
                break
        return pairs

    @staticmethod
    def parse_qa_response(response_text: str, num_exchanges: int) -> List[Dict]:
        """Parse a model response into structured conversation dictionaries.

        The response is split on the literal ``CONVERSATION`` marker; each
        piece is then read using the ``QUESTION:``, ``ANSWER:``,
        ``FOLLOW-UP:`` and ``FOLLOW-UP ANSWER:`` markers. Pieces that miss a
        required marker, or whose required fields are empty after clean-up,
        are skipped.

        Args:
            response_text: Raw text returned by the model. Empty or ``None``
                yields an empty list.
            num_exchanges: Exchanges per conversation. ``1`` parses only
                question/answer; any other value also requires a follow-up.

        Returns:
            One dict per parsed conversation. With ``num_exchanges == 1`` the
            keys are ``question`` and ``answer``. Otherwise the keys are
            ``question``, ``answer``, ``followup_question`` and
            ``followup_answer``; when ``num_exchanges > 2`` and further
            follow-ups are present, they are listed in order under
            ``additional_followups`` as dicts with ``followup_question`` and
            ``followup_answer`` keys.
        """
        if not response_text:
            return []

        conversations = []
        # Everything before the first marker is preamble, hence the [1:].
        for part in response_text.split('CONVERSATION')[1:]:
            # Only plain string operations follow and none of them can raise,
            # so no blanket try/except is needed around the parsing.
            split_result = TextProcessor._split_question_answer(part)
            if split_result is None:
                continue
            question_raw, rest = split_result
            question = TextProcessor._clean_question(question_raw)

            if num_exchanges == 1:
                answer = TextProcessor._clean_answer(rest)
                if question and answer:
                    conversations.append({
                        'question': question,
                        'answer': answer,
                    })
                continue

            if 'FOLLOW-UP:' not in rest:
                continue
            answer_raw, followup_text = rest.split('FOLLOW-UP:', 1)
            raw_pairs = TextProcessor._split_followups(followup_text, num_exchanges - 1)
            if not raw_pairs:
                continue

            answer = TextProcessor._clean_answer(answer_raw)
            cleaned_pairs = [
                (TextProcessor._clean_question(q), TextProcessor._clean_answer(a))
                for q, a in raw_pairs
            ]
            followup_question, followup_answer = cleaned_pairs[0]
            if not (question and answer and followup_question and followup_answer):
                continue

            conversation = {
                'question': question,
                'answer': answer,
                'followup_question': followup_question,
                'followup_answer': followup_answer,
            }
            extra_followups = []
            for extra_question, extra_answer in cleaned_pairs[1:]:
                if not (extra_question and extra_answer):
                    # An incomplete pair means the tail is malformed; stop here.
                    break
                extra_followups.append({
                    'followup_question': extra_question,
                    'followup_answer': extra_answer,
                })
            if extra_followups:
                conversation['additional_followups'] = extra_followups
            conversations.append(conversation)

        return conversations