# agentic_chunker.py
import os
import uuid
from typing import Optional

from dotenv import load_dotenv
from langchain.chains import create_extraction_chain_pydantic
from langchain.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel

load_dotenv()

class AgenticChunker:
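    """
    Groups standalone propositions into topic-based chunks with an LLM.

    Each incoming proposition is either routed to an existing chunk (by
    comparing it against every chunk's title and summary) or used to seed a
    new chunk. Titles and summaries are themselves LLM-generated and, when
    generate_new_metadata_ind is True, refreshed as new propositions arrive.
    """
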
    def __init__(self, openai_api_key=None):
        self.chunks = {}
        self.id_truncate_limit = 5

        # Whether or not to update/refine summaries and titles as you get new information
        self.generate_new_metadata_ind = True
        self.print_logging = True

        if openai_api_key is None:
            openai_api_key = os.getenv("OPENAI_API_KEY")

        if openai_api_key is None:
            raise ValueError("API key is not provided and not found in environment variables")

        self.llm = ChatOpenAI(model='gpt-4-1106-preview', openai_api_key=openai_api_key, temperature=0)
    def add_propositions(self, propositions):
        for proposition in propositions:
            self.add_proposition(proposition)
    def add_proposition(self, proposition):
        if self.print_logging:
            print(f"\nAdding: '{proposition}'")

        # If it's your first chunk, just make a new chunk and don't check for others
        if len(self.chunks) == 0:
            if self.print_logging:
                print("No chunks, creating a new one")
            self._create_new_chunk(proposition)
            return

        chunk_id = self._find_relevant_chunk(proposition)

        # If a relevant chunk was found, add the proposition to it
        if chunk_id:
            if self.print_logging:
                print(f"Chunk Found ({self.chunks[chunk_id]['chunk_id']}), adding to: {self.chunks[chunk_id]['title']}")
            self.add_proposition_to_chunk(chunk_id, proposition)
        else:
            if self.print_logging:
                print("No chunks found")
            # If a chunk wasn't found, then create a new one
            self._create_new_chunk(proposition)
    def add_proposition_to_chunk(self, chunk_id, proposition):
        # Add the proposition to the chunk
        self.chunks[chunk_id]['propositions'].append(proposition)

        # Then refresh the summary and title so they don't go stale
        if self.generate_new_metadata_ind:
            self.chunks[chunk_id]['summary'] = self._update_chunk_summary(self.chunks[chunk_id])
            self.chunks[chunk_id]['title'] = self._update_chunk_title(self.chunks[chunk_id])
    def _update_chunk_summary(self, chunk):
        """
        If you add a new proposition to a chunk, you may want to update the summary or else it could get stale
        """
        PROMPT = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """
                    You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic.
                    A new proposition was just added to one of your chunks. You should generate a very brief 1-sentence summary which will inform viewers what the chunk group is about.

                    A good summary will say what the chunk is about, and give any clarifying instructions on what to add to the chunk.

                    You will be given a group of propositions which are in the chunk and the chunk's current summary.

                    Your summaries should anticipate generalization. If you get a proposition about apples, generalize it to food.
                    Or if you get a proposition about a month, generalize it to "dates and times".

                    Example:
                    Input: Proposition: Greg likes to eat pizza
                    Output: This chunk contains information about the types of food Greg likes to eat.

                    Only respond with the new chunk summary, nothing else.
                    """,
                ),
                ("user", "Chunk's propositions:\n{proposition}\n\nCurrent chunk summary:\n{current_summary}"),
            ]
        )

        runnable = PROMPT | self.llm

        new_chunk_summary = runnable.invoke({
            "proposition": "\n".join(chunk['propositions']),
            "current_summary": chunk['summary']
        }).content

        return new_chunk_summary
    def _update_chunk_title(self, chunk):
        """
        If you add a new proposition to a chunk, you may want to update the title or else it can get stale
        """
        PROMPT = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """
                    You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic.
                    A new proposition was just added to one of your chunks. You should generate a very brief updated chunk title which will inform viewers what the chunk group is about.

                    A good title will say what the chunk is about.

                    You will be given a group of propositions which are in the chunk, the chunk's summary, and the chunk's current title.

                    Your title should anticipate generalization. If you get a proposition about apples, generalize it to food.
                    Or if you get a proposition about a month, generalize it to "dates and times".

                    Example:
                    Input: Summary: This chunk is about dates and times that the author talks about
                    Output: Date & Times

                    Only respond with the new chunk title, nothing else.
                    """,
                ),
                ("user", "Chunk's propositions:\n{proposition}\n\nChunk summary:\n{current_summary}\n\nCurrent chunk title:\n{current_title}"),
            ]
        )

        runnable = PROMPT | self.llm

        updated_chunk_title = runnable.invoke({
            "proposition": "\n".join(chunk['propositions']),
            "current_summary": chunk['summary'],
            "current_title": chunk['title']
        }).content

        return updated_chunk_title
    def _get_new_chunk_summary(self, proposition):
        PROMPT = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """
                    You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic.
                    You should generate a very brief 1-sentence summary which will inform viewers what the chunk group is about.

                    A good summary will say what the chunk is about, and give any clarifying instructions on what to add to the chunk.

                    You will be given a proposition which will go into a new chunk. This new chunk needs a summary.

                    Your summaries should anticipate generalization. If you get a proposition about apples, generalize it to food.
                    Or if you get a proposition about a month, generalize it to "dates and times".

                    Example:
                    Input: Proposition: Greg likes to eat pizza
                    Output: This chunk contains information about the types of food Greg likes to eat.

                    Only respond with the new chunk summary, nothing else.
                    """,
                ),
                ("user", "Determine the summary of the new chunk that this proposition will go into:\n{proposition}"),
            ]
        )

        runnable = PROMPT | self.llm

        new_chunk_summary = runnable.invoke({
            "proposition": proposition
        }).content

        return new_chunk_summary
    def _get_new_chunk_title(self, summary):
        PROMPT = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """
                    You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic.
                    You should generate a very brief few-word chunk title which will inform viewers what the chunk group is about.

                    A good chunk title is brief but encompasses what the chunk is about.

                    You will be given a summary of a chunk which needs a title.

                    Your titles should anticipate generalization. If you get a proposition about apples, generalize it to food.
                    Or if you get a proposition about a month, generalize it to "dates and times".

                    Example:
                    Input: Summary: This chunk is about dates and times that the author talks about
                    Output: Date & Times

                    Only respond with the new chunk title, nothing else.
                    """,
                ),
                ("user", "Determine the title of the chunk that this summary belongs to:\n{summary}"),
            ]
        )

        runnable = PROMPT | self.llm

        new_chunk_title = runnable.invoke({
            "summary": summary
        }).content

        return new_chunk_title
    def _create_new_chunk(self, proposition):
        new_chunk_id = str(uuid.uuid4())[:self.id_truncate_limit]  # I don't want long ids
        new_chunk_summary = self._get_new_chunk_summary(proposition)
        new_chunk_title = self._get_new_chunk_title(new_chunk_summary)

        self.chunks[new_chunk_id] = {
            'chunk_id': new_chunk_id,
            'propositions': [proposition],
            'title': new_chunk_title,
            'summary': new_chunk_summary,
            'chunk_index': len(self.chunks)
        }
        if self.print_logging:
            print(f"Created new chunk ({new_chunk_id}): {new_chunk_title}")
    def get_chunk_outline(self):
        """
        Get a string which represents the chunks you currently have.
        This will be empty when you first start off
        """
        chunk_outline = ""

        for chunk_id, chunk in self.chunks.items():
            single_chunk_string = f"""Chunk ID: {chunk['chunk_id']}\nChunk Name: {chunk['title']}\nChunk Summary: {chunk['summary']}\n\n"""
            chunk_outline += single_chunk_string

        return chunk_outline
    def _find_relevant_chunk(self, proposition):
        current_chunk_outline = self.get_chunk_outline()

        PROMPT = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """
                    Determine whether or not the "Proposition" should belong to any of the existing chunks.

                    A proposition should belong to a chunk if its meaning, direction, or intention is similar to the chunk's.
                    The goal is to group similar propositions and chunks.

                    If you think a proposition should be joined with a chunk, return the chunk id.
                    If you do not think an item should be joined with an existing chunk, just return "No chunks"

                    Example:
                    Input:
                        - Proposition: "Greg really likes hamburgers"
                        - Current Chunks:
                            - Chunk ID: 2n4l3d
                            - Chunk Name: Places in San Francisco
                            - Chunk Summary: Overview of the things to do with San Francisco Places

                            - Chunk ID: 93833k
                            - Chunk Name: Food Greg likes
                            - Chunk Summary: Lists of the food and dishes that Greg likes
                    Output: 93833k
                    """,
                ),
                ("user", "Current Chunks:\n--Start of current chunks--\n{current_chunk_outline}\n--End of current chunks--"),
                ("user", "Determine if the following statement should belong to one of the chunks outlined:\n{proposition}"),
            ]
        )

        runnable = PROMPT | self.llm

        chunk_found = runnable.invoke({
            "proposition": proposition,
            "current_chunk_outline": current_chunk_outline
        }).content

        # Pydantic data class
        class ChunkID(BaseModel):
            """Extracting the chunk id"""
            chunk_id: Optional[str]

        # Extraction to catch all LLM responses. This is a bandaid
        extraction_chain = create_extraction_chain_pydantic(pydantic_schema=ChunkID, llm=self.llm)
        extraction_found = extraction_chain.run(chunk_found)
        if extraction_found:
            chunk_found = extraction_found[0].chunk_id

        # If the response isn't the length of a chunk id, chances are it's a bad response
        # or nothing was found, so return None. Also guard against the extractor
        # returning None, which would make len() raise.
        if not chunk_found or len(chunk_found) != self.id_truncate_limit:
            return None

        return chunk_found
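
    # Note: routing takes two LLM passes. The first call answers free-form
    # (a chunk id or "No chunks"); the second, a pydantic extraction chain, is a
    # catch-all that pulls a bare chunk id out of whatever text came back. The
    # final length check is the last guard against answers like "No chunks".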
    def get_chunks(self, get_type='dict'):
        """
        This function returns the chunks in the format specified by the 'get_type' parameter.
        If 'get_type' is 'dict', it returns the chunks as a dictionary.
        If 'get_type' is 'list_of_strings', it returns the chunks as a list of strings, where each string is all of a chunk's propositions joined together.
        """
        if get_type == 'dict':
            return self.chunks
        if get_type == 'list_of_strings':
            chunks = []
            for chunk_id, chunk in self.chunks.items():
                chunks.append(" ".join(chunk['propositions']))
            return chunks
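
    # Usage sketch (output values are illustrative, not real LLM groupings):
    #   ac.get_chunks(get_type='list_of_strings')
    #   -> ['The month is October. The year is 2023.', 'Teachers and coaches ...']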
    def pretty_print_chunks(self):
        print(f"\nYou have {len(self.chunks)} chunks\n")
        for chunk_id, chunk in self.chunks.items():
            print(f"Chunk #{chunk['chunk_index']}")
            print(f"Chunk ID: {chunk_id}")
            print(f"Summary: {chunk['summary']}")
            print("Propositions:")
            for prop in chunk['propositions']:
                print(f"    - {prop}")
            print("\n\n")
    def pretty_print_chunk_outline(self):
        print("Chunk Outline\n")
        print(self.get_chunk_outline())

if __name__ == "__main__":
    ac = AgenticChunker()

    ## Comment and uncomment the propositions to your heart's content
    propositions = [
        'The month is October.',
        'The year is 2023.',
        "One of the most important things that I didn't understand about the world as a child was the degree to which the returns for performance are superlinear.",
        'Teachers and coaches implicitly told us that the returns were linear.',
        "I heard a thousand times that 'You get out what you put in.'",
        # 'Teachers and coaches meant well.',
        # "The statement that 'You get out what you put in' is rarely true.",
        # "If your product is only half as good as your competitor's product, you do not get half as many customers.",
        # "You get no customers if your product is only half as good as your competitor's product.",
        # 'You go out of business if you get no customers.',
        # 'The returns for performance are superlinear in business.',
        # 'Some people think the superlinear returns for performance are a flaw of capitalism.',
        # 'Some people think that changing the rules of capitalism would stop the superlinear returns for performance from being true.',
        # 'Superlinear returns for performance are a feature of the world.',
        # 'Superlinear returns for performance are not an artifact of rules that humans have invented.',
        # 'The same pattern of superlinear returns is observed in fame.',
        # 'The same pattern of superlinear returns is observed in power.',
        # 'The same pattern of superlinear returns is observed in military victories.',
        # 'The same pattern of superlinear returns is observed in knowledge.',
        # 'The same pattern of superlinear returns is observed in benefit to humanity.',
        # 'In fame, power, military victories, knowledge, and benefit to humanity, the rich get richer.'
    ]

    ac.add_propositions(propositions)
    ac.pretty_print_chunks()
    ac.pretty_print_chunk_outline()
    print(ac.get_chunks(get_type='list_of_strings'))
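
    # A minimal sketch of handing the results to a downstream retrieval
    # pipeline, assuming langchain_core's Document class; the metadata fields
    # chosen here are just one reasonable option, not part of this file's API:
    #
    #   from langchain_core.documents import Document
    #
    #   docs = [
    #       Document(
    #           page_content=" ".join(chunk['propositions']),
    #           metadata={'title': chunk['title'], 'summary': chunk['summary']},
    #       )
    #       for chunk in ac.get_chunks(get_type='dict').values()
    #   ]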