@@ -102,7 +102,10 @@ interface EmbeddingResponse {
102
102
[ key : string ] : any ;
103
103
}
104
104
105
- async function embedWithRetry ( texts : string [ ] , retryCount = 0 ) : Promise < EmbeddingResponse > {
105
+ async function embedWithRetry (
106
+ texts : string [ ] ,
107
+ retryCount = 0
108
+ ) : Promise < EmbeddingResponse > {
106
109
try {
107
110
// Set a timeout promise
108
111
const timeoutPromise = new Promise < never > ( ( _ , reject ) => {
@@ -116,7 +119,7 @@ async function embedWithRetry(texts: string[], retryCount = 0): Promise<Embeddin
116
119
input : texts ,
117
120
inputType : 'document' ,
118
121
} ) as Promise < EmbeddingResponse > ,
119
- timeoutPromise
122
+ timeoutPromise ,
120
123
] ) ;
121
124
} catch ( error : any ) {
122
125
// Check if we still have retries left
@@ -131,26 +134,31 @@ async function embedWithRetry(texts: string[], retryCount = 0): Promise<Embeddin
131
134
) ;
132
135
await wait ( delay ) ;
133
136
return embedWithRetry ( texts , retryCount + 1 ) ;
134
- }
137
+ }
135
138
// Handle timeouts and other transient errors
136
- else if ( error . message === 'Request timed out' ||
137
- error . message ?. includes ( 'timeout' ) ||
138
- error . message ?. includes ( 'network' ) ||
139
- error . code === 'ECONNRESET' ||
140
- error . code === 'ETIMEDOUT' ) {
139
+ else if (
140
+ error . message === 'Request timed out' ||
141
+ error . message ?. includes ( 'timeout' ) ||
142
+ error . message ?. includes ( 'network' ) ||
143
+ error . code === 'ECONNRESET' ||
144
+ error . code === 'ETIMEDOUT'
145
+ ) {
141
146
const delay = INITIAL_RETRY_DELAY * Math . pow ( 2 , retryCount ) ;
142
147
console . log (
143
- `Request failed with error: ${ error . message } . Waiting ${ delay } ms before retry ${
144
- retryCount + 1
145
- } /${ MAX_RETRIES } `
148
+ `Request failed with error: ${
149
+ error . message
150
+ } . Waiting ${ delay } ms before retry ${ retryCount + 1 } /${ MAX_RETRIES } `
146
151
) ;
147
152
await wait ( delay ) ;
148
153
return embedWithRetry ( texts , retryCount + 1 ) ;
149
154
}
150
155
}
151
-
156
+
152
157
// No more retries or non-retriable error
153
- console . error ( `Embedding failed after ${ retryCount } retries:` , error . message ) ;
158
+ console . error (
159
+ `Embedding failed after ${ retryCount } retries:` ,
160
+ error . message
161
+ ) ;
154
162
throw error ;
155
163
}
156
164
}
@@ -165,49 +173,88 @@ function extractPostDate(filePath: string, frontmatter: any): Date {
165
173
// Approach 1: Direct Date constructor (handles ISO formats and many common formats)
166
174
const parsedDate = new Date ( frontmatter . date ) ;
167
175
if ( ! isNaN ( parsedDate . getTime ( ) ) ) {
168
- console . log ( `Date from frontmatter (direct): ${ parsedDate . toISOString ( ) } for ${ filePath } ` ) ;
176
+ console . log (
177
+ `Date from frontmatter (direct): ${ parsedDate . toISOString ( ) } for ${ filePath } `
178
+ ) ;
169
179
return parsedDate ;
170
180
}
171
-
181
+
172
182
// Approach 2: Handle month name formats like "Jun 1, 2024" or "June 1 2024"
173
- const monthNameMatch = String ( frontmatter . date ) . match ( / ( [ A - Z a - z ] + ) \s + ( \d { 1 , 2 } ) (?: , ? \s + ) ? ( \d { 4 } ) / ) ;
183
+ const monthNameMatch = String ( frontmatter . date ) . match (
184
+ / ( [ A - Z a - z ] + ) \s + ( \d { 1 , 2 } ) (?: , ? \s + ) ? ( \d { 4 } ) /
185
+ ) ;
174
186
if ( monthNameMatch ) {
175
187
const [ _ , month , day , year ] = monthNameMatch ;
176
- const monthMap : { [ key : string ] : number } = {
177
- jan : 0 , january : 0 , feb : 1 , february : 1 , mar : 2 , march : 2 ,
178
- apr : 3 , april : 3 , may : 4 , jun : 5 , june : 5 , jul : 6 , july : 6 ,
179
- aug : 7 , august : 7 , sep : 8 , september : 8 , oct : 9 , october : 9 ,
180
- nov : 10 , november : 10 , dec : 11 , december : 11
188
+ const monthMap : { [ key : string ] : number } = {
189
+ jan : 0 ,
190
+ january : 0 ,
191
+ feb : 1 ,
192
+ february : 1 ,
193
+ mar : 2 ,
194
+ march : 2 ,
195
+ apr : 3 ,
196
+ april : 3 ,
197
+ may : 4 ,
198
+ jun : 5 ,
199
+ june : 5 ,
200
+ jul : 6 ,
201
+ july : 6 ,
202
+ aug : 7 ,
203
+ august : 7 ,
204
+ sep : 8 ,
205
+ september : 8 ,
206
+ oct : 9 ,
207
+ october : 9 ,
208
+ nov : 10 ,
209
+ november : 10 ,
210
+ dec : 11 ,
211
+ december : 11 ,
181
212
} ;
182
-
213
+
183
214
const monthIndex = monthMap [ month . toLowerCase ( ) ] ;
184
215
if ( monthIndex !== undefined ) {
185
- const formattedDate = new Date ( parseInt ( year ) , monthIndex , parseInt ( day ) ) ;
216
+ const formattedDate = new Date (
217
+ parseInt ( year ) ,
218
+ monthIndex ,
219
+ parseInt ( day )
220
+ ) ;
186
221
if ( ! isNaN ( formattedDate . getTime ( ) ) ) {
187
- console . log ( `Date from frontmatter (month name): ${ formattedDate . toISOString ( ) } for ${ filePath } ` ) ;
222
+ console . log (
223
+ `Date from frontmatter (month name): ${ formattedDate . toISOString ( ) } for ${ filePath } `
224
+ ) ;
188
225
return formattedDate ;
189
226
}
190
227
}
191
228
}
192
-
229
+
193
230
// Log warning if we have a date field but couldn't parse it
194
- console . warn ( `Warning: Could not parse date '${ frontmatter . date } ' from frontmatter in ${ filePath } ` ) ;
231
+ console . warn (
232
+ `Warning: Could not parse date '${ frontmatter . date } ' from frontmatter in ${ filePath } `
233
+ ) ;
195
234
}
196
235
197
236
// Try to parse from filename as fallback (e.g., MMDDYY.md format)
198
237
const filenameMatch = filePath . match ( / ( \d { 2 } ) ( \d { 2 } ) ( \d { 2 } ) \. m d $ / ) ;
199
238
if ( filenameMatch ) {
200
239
const [ _ , month , day , year ] = filenameMatch ;
201
240
const fullYear = parseInt ( `20${ year } ` ) ; // Assuming 20xx years
202
- const dateFromFilename = new Date ( fullYear , parseInt ( month ) - 1 , parseInt ( day ) ) ;
203
- console . log ( `Date from filename: ${ dateFromFilename . toISOString ( ) } for ${ filePath } ` ) ;
241
+ const dateFromFilename = new Date (
242
+ fullYear ,
243
+ parseInt ( month ) - 1 ,
244
+ parseInt ( day )
245
+ ) ;
246
+ console . log (
247
+ `Date from filename: ${ dateFromFilename . toISOString ( ) } for ${ filePath } `
248
+ ) ;
204
249
return dateFromFilename ;
205
250
}
206
251
207
252
// Use a stable default date for posts with no date instead of current date
208
253
// Using January 1, 2020 as a reasonable default that will still sort correctly
209
254
const defaultDate = new Date ( 2020 , 0 , 1 ) ;
210
- console . warn ( `Warning: No date found for ${ filePath } , using default date ${ defaultDate . toISOString ( ) } ` ) ;
255
+ console . warn (
256
+ `Warning: No date found for ${ filePath } , using default date ${ defaultDate . toISOString ( ) } `
257
+ ) ;
211
258
return defaultDate ;
212
259
}
213
260
@@ -240,7 +287,7 @@ async function generateEmbeddingsForSingleFile(
240
287
const { frontmatter, chunks } = post ;
241
288
let successfulChunks = 0 ;
242
289
let failedChunks = 0 ;
243
-
290
+
244
291
// Create a chunk-level progress bar
245
292
function updateChunkProgress ( ) {
246
293
const total = chunks . length ;
@@ -341,7 +388,7 @@ async function generateEmbeddingsForSingleFile(
341
388
} catch ( error ) {
342
389
console . error ( 'Error inserting whole post chunk:' , error ) ;
343
390
failedChunks ++ ;
344
-
391
+
345
392
// Update chunk progress after error
346
393
console . log ( updateChunkProgress ( ) ) ;
347
394
}
@@ -358,7 +405,11 @@ async function generateEmbeddingsForSingleFile(
358
405
const batchChunks = chunks . slice ( i , i + BATCH_SIZE ) ;
359
406
const batchEnd = Math . min ( i + BATCH_SIZE , chunks . length ) ;
360
407
361
- console . log ( `\nProcessing batch ${ i } -${ batchEnd } of ${ chunks . length } (${ Math . ceil ( ( batchEnd - i ) / BATCH_SIZE ) } /${ Math . ceil ( chunks . length / BATCH_SIZE ) } batches)` ) ;
408
+ console . log (
409
+ `\nProcessing batch ${ i } -${ batchEnd } of ${ chunks . length } (${ Math . ceil (
410
+ ( batchEnd - i ) / BATCH_SIZE
411
+ ) } /${ Math . ceil ( chunks . length / BATCH_SIZE ) } batches)`
412
+ ) ;
362
413
363
414
try {
364
415
// Format chunks with more context
@@ -383,38 +434,43 @@ async function generateEmbeddingsForSingleFile(
383
434
message ?: string ;
384
435
code ?: string ;
385
436
} ;
386
-
387
- if ( inputTexts . length > 3 && (
388
- error . message ?. includes ( 'timeout' ) ||
389
- error . message ?. includes ( 'network' ) ||
390
- error . code === 'ECONNRESET' ||
391
- error . code === 'ETIMEDOUT' ) ) {
392
-
437
+
438
+ if (
439
+ inputTexts . length > 3 &&
440
+ ( error . message ?. includes ( 'timeout' ) ||
441
+ error . message ?. includes ( 'network' ) ||
442
+ error . code === 'ECONNRESET' ||
443
+ error . code === 'ETIMEDOUT' )
444
+ ) {
393
445
console . log ( `Error processing full batch: ${ error . message } ` ) ;
394
446
console . log ( `Splitting batch into smaller chunks and retrying...` ) ;
395
-
447
+
396
448
// Split the batch in half
397
449
const midpoint = Math . floor ( inputTexts . length / 2 ) ;
398
450
const firstHalf = inputTexts . slice ( 0 , midpoint ) ;
399
451
const secondHalf = inputTexts . slice ( midpoint ) ;
400
-
452
+
401
453
// Process first half
402
454
console . log ( `Processing first half (${ firstHalf . length } chunks)...` ) ;
403
455
const firstResponse = await embedWithRetry ( firstHalf ) ;
404
-
456
+
405
457
// Add delay between sub-batches
406
458
await wait ( DELAY_BETWEEN_BATCHES ) ;
407
-
459
+
408
460
// Process second half
409
- console . log ( `Processing second half (${ secondHalf . length } chunks)...` ) ;
461
+ console . log (
462
+ `Processing second half (${ secondHalf . length } chunks)...`
463
+ ) ;
410
464
const secondResponse = await embedWithRetry ( secondHalf ) ;
411
-
465
+
412
466
// Merge responses
413
467
response = {
414
- data : [ ...firstResponse . data , ...secondResponse . data ]
468
+ data : [ ...firstResponse . data , ...secondResponse . data ] ,
415
469
} ;
416
-
417
- console . log ( `Successfully processed split batch with ${ response . data . length } embeddings` ) ;
470
+
471
+ console . log (
472
+ `Successfully processed split batch with ${ response . data . length } embeddings`
473
+ ) ;
418
474
} else {
419
475
// If not a timeout or the batch is already small, rethrow
420
476
throw embeddingError ;
@@ -552,6 +608,10 @@ async function generateEmbeddingsForSingleFile(
552
608
const results = await Promise . all ( insertPromises ) ;
553
609
const successCount = results . filter ( Boolean ) . length ;
554
610
611
+ // Update counters based on results
612
+ successfulChunks += successCount ;
613
+ failedChunks += batchChunks . length - successCount ;
614
+
555
615
// Process overlaps after all inserts completed
556
616
if ( successCount > 1 ) {
557
617
// Only process overlaps if we have at least 2 chunks
@@ -561,7 +621,7 @@ async function generateEmbeddingsForSingleFile(
561
621
562
622
// Update chunk progress
563
623
console . log ( updateChunkProgress ( ) ) ;
564
-
624
+
565
625
console . log (
566
626
`✅ Batch complete: ${ successCount } /${ batchChunks . length } chunks successful with sliding window overlaps`
567
627
) ;
@@ -571,22 +631,26 @@ async function generateEmbeddingsForSingleFile(
571
631
} catch ( error ) {
572
632
console . error ( 'Error processing batch:' , error ) ;
573
633
failedChunks += batchChunks . length ;
574
-
634
+
575
635
// Update chunk progress after error
576
636
console . log ( updateChunkProgress ( ) ) ;
577
637
}
578
638
}
579
639
580
640
return { successfulChunks, failedChunks } ;
581
641
}
582
-
583
642
/**
 * Creates a simple ASCII progress bar such as `[█████░░░░░] 50% (5/10)`.
 *
 * The filled fraction is `current / total`, clamped to the range [0, 1] so the
 * bar always renders exactly `width` cells and `String.prototype.repeat` is
 * never called with a negative or NaN count.
 *
 * @param current - Number of items completed so far. Printed verbatim in the
 *   `(current/total)` suffix even when it lies outside `0..total`.
 * @param total - Total number of items. A non-positive or non-finite total is
 *   rendered as 0% progress instead of `NaN%`.
 * @param width - Number of bar cells (default 30). Fractional values are
 *   truncated; negative or NaN values are treated as 0.
 * @returns The formatted bar, percentage and raw counters.
 */
function createProgressBar(
  current: number,
  total: number,
  width: number = 30
): string {
  // Guard against division by zero (e.g. a post that produced no chunks).
  const rawRatio = total > 0 ? current / total : 0;
  // Clamp so out-of-range counters cannot produce a negative repeat() count.
  const ratio = Number.isFinite(rawRatio)
    ? Math.min(Math.max(rawRatio, 0), 1)
    : 0;

  // `|| 0` maps NaN to 0; Math.max rejects negative widths.
  const barWidth = Math.max(0, Math.trunc(width) || 0);

  const percentage = Math.round(ratio * 100);
  const filledCells = Math.round(ratio * barWidth);
  const progressBar =
    '█'.repeat(filledCells) + '░'.repeat(barWidth - filledCells);

  return `[${progressBar}] ${percentage}% (${current}/${total})`;
}
592
656
@@ -605,7 +669,9 @@ async function generateEmbeddingsForAllFiles() {
605
669
606
670
for ( let i = 0 ; i < nonDraftPosts . length ; i ++ ) {
607
671
const post = nonDraftPosts [ i ] ;
608
- console . log ( `\n=== Processing file ${ i + 1 } /${ totalFiles } : ${ post . filePath } ===` ) ;
672
+ console . log (
673
+ `\n=== Processing file ${ i + 1 } /${ totalFiles } : ${ post . filePath } ===`
674
+ ) ;
609
675
610
676
const { successfulChunks, failedChunks } =
611
677
await generateEmbeddingsForSingleFile ( post . filePath ) ;
@@ -614,8 +680,8 @@ async function generateEmbeddingsForAllFiles() {
614
680
totalFailed += failedChunks ;
615
681
616
682
// Update progress bar
617
- console . log ( createProgressBar ( i + 1 , totalFiles ) ) ;
618
-
683
+ console . log ( createProgressBar ( i + 1 , totalFiles ) ) ;
684
+
619
685
// Add delay between files
620
686
if ( i < nonDraftPosts . length - 1 ) {
621
687
console . log ( `Waiting ${ DELAY_BETWEEN_FILES } ms before next file...` ) ;
0 commit comments