Skip to content

Commit 1a1534b

Browse files
committed
fix embeddings summary stats not counting
1 parent 7aaf40b commit 1a1534b

File tree

1 file changed

+122
-56
lines changed

1 file changed

+122
-56
lines changed

scripts/generateEmbeddings.ts

Lines changed: 122 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,10 @@ interface EmbeddingResponse {
102102
[key: string]: any;
103103
}
104104

105-
async function embedWithRetry(texts: string[], retryCount = 0): Promise<EmbeddingResponse> {
105+
async function embedWithRetry(
106+
texts: string[],
107+
retryCount = 0
108+
): Promise<EmbeddingResponse> {
106109
try {
107110
// Set a timeout promise
108111
const timeoutPromise = new Promise<never>((_, reject) => {
@@ -116,7 +119,7 @@ async function embedWithRetry(texts: string[], retryCount = 0): Promise<Embeddin
116119
input: texts,
117120
inputType: 'document',
118121
}) as Promise<EmbeddingResponse>,
119-
timeoutPromise
122+
timeoutPromise,
120123
]);
121124
} catch (error: any) {
122125
// Check if we still have retries left
@@ -131,26 +134,31 @@ async function embedWithRetry(texts: string[], retryCount = 0): Promise<Embeddin
131134
);
132135
await wait(delay);
133136
return embedWithRetry(texts, retryCount + 1);
134-
}
137+
}
135138
// Handle timeouts and other transient errors
136-
else if (error.message === 'Request timed out' ||
137-
error.message?.includes('timeout') ||
138-
error.message?.includes('network') ||
139-
error.code === 'ECONNRESET' ||
140-
error.code === 'ETIMEDOUT') {
139+
else if (
140+
error.message === 'Request timed out' ||
141+
error.message?.includes('timeout') ||
142+
error.message?.includes('network') ||
143+
error.code === 'ECONNRESET' ||
144+
error.code === 'ETIMEDOUT'
145+
) {
141146
const delay = INITIAL_RETRY_DELAY * Math.pow(2, retryCount);
142147
console.log(
143-
`Request failed with error: ${error.message}. Waiting ${delay}ms before retry ${
144-
retryCount + 1
145-
}/${MAX_RETRIES}`
148+
`Request failed with error: ${
149+
error.message
150+
}. Waiting ${delay}ms before retry ${retryCount + 1}/${MAX_RETRIES}`
146151
);
147152
await wait(delay);
148153
return embedWithRetry(texts, retryCount + 1);
149154
}
150155
}
151-
156+
152157
// No more retries or non-retriable error
153-
console.error(`Embedding failed after ${retryCount} retries:`, error.message);
158+
console.error(
159+
`Embedding failed after ${retryCount} retries:`,
160+
error.message
161+
);
154162
throw error;
155163
}
156164
}
@@ -165,49 +173,88 @@ function extractPostDate(filePath: string, frontmatter: any): Date {
165173
// Approach 1: Direct Date constructor (handles ISO formats and many common formats)
166174
const parsedDate = new Date(frontmatter.date);
167175
if (!isNaN(parsedDate.getTime())) {
168-
console.log(`Date from frontmatter (direct): ${parsedDate.toISOString()} for ${filePath}`);
176+
console.log(
177+
`Date from frontmatter (direct): ${parsedDate.toISOString()} for ${filePath}`
178+
);
169179
return parsedDate;
170180
}
171-
181+
172182
// Approach 2: Handle month name formats like "Jun 1, 2024" or "June 1 2024"
173-
const monthNameMatch = String(frontmatter.date).match(/([A-Za-z]+)\s+(\d{1,2})(?:,?\s+)?(\d{4})/);
183+
const monthNameMatch = String(frontmatter.date).match(
184+
/([A-Za-z]+)\s+(\d{1,2})(?:,?\s+)?(\d{4})/
185+
);
174186
if (monthNameMatch) {
175187
const [_, month, day, year] = monthNameMatch;
176-
const monthMap: {[key: string]: number} = {
177-
jan: 0, january: 0, feb: 1, february: 1, mar: 2, march: 2,
178-
apr: 3, april: 3, may: 4, jun: 5, june: 5, jul: 6, july: 6,
179-
aug: 7, august: 7, sep: 8, september: 8, oct: 9, october: 9,
180-
nov: 10, november: 10, dec: 11, december: 11
188+
const monthMap: { [key: string]: number } = {
189+
jan: 0,
190+
january: 0,
191+
feb: 1,
192+
february: 1,
193+
mar: 2,
194+
march: 2,
195+
apr: 3,
196+
april: 3,
197+
may: 4,
198+
jun: 5,
199+
june: 5,
200+
jul: 6,
201+
july: 6,
202+
aug: 7,
203+
august: 7,
204+
sep: 8,
205+
september: 8,
206+
oct: 9,
207+
october: 9,
208+
nov: 10,
209+
november: 10,
210+
dec: 11,
211+
december: 11,
181212
};
182-
213+
183214
const monthIndex = monthMap[month.toLowerCase()];
184215
if (monthIndex !== undefined) {
185-
const formattedDate = new Date(parseInt(year), monthIndex, parseInt(day));
216+
const formattedDate = new Date(
217+
parseInt(year),
218+
monthIndex,
219+
parseInt(day)
220+
);
186221
if (!isNaN(formattedDate.getTime())) {
187-
console.log(`Date from frontmatter (month name): ${formattedDate.toISOString()} for ${filePath}`);
222+
console.log(
223+
`Date from frontmatter (month name): ${formattedDate.toISOString()} for ${filePath}`
224+
);
188225
return formattedDate;
189226
}
190227
}
191228
}
192-
229+
193230
// Log warning if we have a date field but couldn't parse it
194-
console.warn(`Warning: Could not parse date '${frontmatter.date}' from frontmatter in ${filePath}`);
231+
console.warn(
232+
`Warning: Could not parse date '${frontmatter.date}' from frontmatter in ${filePath}`
233+
);
195234
}
196235

197236
// Try to parse from filename as fallback (e.g., MMDDYY.md format)
198237
const filenameMatch = filePath.match(/(\d{2})(\d{2})(\d{2})\.md$/);
199238
if (filenameMatch) {
200239
const [_, month, day, year] = filenameMatch;
201240
const fullYear = parseInt(`20${year}`); // Assuming 20xx years
202-
const dateFromFilename = new Date(fullYear, parseInt(month) - 1, parseInt(day));
203-
console.log(`Date from filename: ${dateFromFilename.toISOString()} for ${filePath}`);
241+
const dateFromFilename = new Date(
242+
fullYear,
243+
parseInt(month) - 1,
244+
parseInt(day)
245+
);
246+
console.log(
247+
`Date from filename: ${dateFromFilename.toISOString()} for ${filePath}`
248+
);
204249
return dateFromFilename;
205250
}
206251

207252
// Use a stable default date for posts with no date instead of current date
208253
// Using January 1, 2020 as a reasonable default that will still sort correctly
209254
const defaultDate = new Date(2020, 0, 1);
210-
console.warn(`Warning: No date found for ${filePath}, using default date ${defaultDate.toISOString()}`);
255+
console.warn(
256+
`Warning: No date found for ${filePath}, using default date ${defaultDate.toISOString()}`
257+
);
211258
return defaultDate;
212259
}
213260

@@ -240,7 +287,7 @@ async function generateEmbeddingsForSingleFile(
240287
const { frontmatter, chunks } = post;
241288
let successfulChunks = 0;
242289
let failedChunks = 0;
243-
290+
244291
// Create a chunk-level progress bar
245292
function updateChunkProgress() {
246293
const total = chunks.length;
@@ -341,7 +388,7 @@ async function generateEmbeddingsForSingleFile(
341388
} catch (error) {
342389
console.error('Error inserting whole post chunk:', error);
343390
failedChunks++;
344-
391+
345392
// Update chunk progress after error
346393
console.log(updateChunkProgress());
347394
}
@@ -358,7 +405,11 @@ async function generateEmbeddingsForSingleFile(
358405
const batchChunks = chunks.slice(i, i + BATCH_SIZE);
359406
const batchEnd = Math.min(i + BATCH_SIZE, chunks.length);
360407

361-
console.log(`\nProcessing batch ${i}-${batchEnd} of ${chunks.length} (${Math.ceil((batchEnd - i) / BATCH_SIZE)}/${Math.ceil(chunks.length / BATCH_SIZE)} batches)`);
408+
console.log(
409+
`\nProcessing batch ${i}-${batchEnd} of ${chunks.length} (${Math.ceil(
410+
(batchEnd - i) / BATCH_SIZE
411+
)}/${Math.ceil(chunks.length / BATCH_SIZE)} batches)`
412+
);
362413

363414
try {
364415
// Format chunks with more context
@@ -383,38 +434,43 @@ async function generateEmbeddingsForSingleFile(
383434
message?: string;
384435
code?: string;
385436
};
386-
387-
if (inputTexts.length > 3 && (
388-
error.message?.includes('timeout') ||
389-
error.message?.includes('network') ||
390-
error.code === 'ECONNRESET' ||
391-
error.code === 'ETIMEDOUT')) {
392-
437+
438+
if (
439+
inputTexts.length > 3 &&
440+
(error.message?.includes('timeout') ||
441+
error.message?.includes('network') ||
442+
error.code === 'ECONNRESET' ||
443+
error.code === 'ETIMEDOUT')
444+
) {
393445
console.log(`Error processing full batch: ${error.message}`);
394446
console.log(`Splitting batch into smaller chunks and retrying...`);
395-
447+
396448
// Split the batch in half
397449
const midpoint = Math.floor(inputTexts.length / 2);
398450
const firstHalf = inputTexts.slice(0, midpoint);
399451
const secondHalf = inputTexts.slice(midpoint);
400-
452+
401453
// Process first half
402454
console.log(`Processing first half (${firstHalf.length} chunks)...`);
403455
const firstResponse = await embedWithRetry(firstHalf);
404-
456+
405457
// Add delay between sub-batches
406458
await wait(DELAY_BETWEEN_BATCHES);
407-
459+
408460
// Process second half
409-
console.log(`Processing second half (${secondHalf.length} chunks)...`);
461+
console.log(
462+
`Processing second half (${secondHalf.length} chunks)...`
463+
);
410464
const secondResponse = await embedWithRetry(secondHalf);
411-
465+
412466
// Merge responses
413467
response = {
414-
data: [...firstResponse.data, ...secondResponse.data]
468+
data: [...firstResponse.data, ...secondResponse.data],
415469
};
416-
417-
console.log(`Successfully processed split batch with ${response.data.length} embeddings`);
470+
471+
console.log(
472+
`Successfully processed split batch with ${response.data.length} embeddings`
473+
);
418474
} else {
419475
// If not a timeout or the batch is already small, rethrow
420476
throw embeddingError;
@@ -552,6 +608,10 @@ async function generateEmbeddingsForSingleFile(
552608
const results = await Promise.all(insertPromises);
553609
const successCount = results.filter(Boolean).length;
554610

611+
// Update counters based on results
612+
successfulChunks += successCount;
613+
failedChunks += batchChunks.length - successCount;
614+
555615
// Process overlaps after all inserts completed
556616
if (successCount > 1) {
557617
// Only process overlaps if we have at least 2 chunks
@@ -561,7 +621,7 @@ async function generateEmbeddingsForSingleFile(
561621

562622
// Update chunk progress
563623
console.log(updateChunkProgress());
564-
624+
565625
console.log(
566626
`✅ Batch complete: ${successCount}/${batchChunks.length} chunks successful with sliding window overlaps`
567627
);
@@ -571,22 +631,26 @@ async function generateEmbeddingsForSingleFile(
571631
} catch (error) {
572632
console.error('Error processing batch:', error);
573633
failedChunks += batchChunks.length;
574-
634+
575635
// Update chunk progress after error
576636
console.log(updateChunkProgress());
577637
}
578638
}
579639

580640
return { successfulChunks, failedChunks };
581641
}
582-
583642
/**
 * Creates a simple ASCII progress bar.
 *
 * The bar is rendered as filled (`█`) and empty (`░`) cells followed by the
 * rounded percentage and the raw `current/total` counts, e.g.
 * `[██░░] 50% (1/2)`.
 *
 * @param current - Number of items completed so far.
 * @param total - Total number of items to process.
 * @param width - Number of cells in the bar (defaults to 30).
 * @returns The formatted progress bar string.
 */
function createProgressBar(
  current: number,
  total: number,
  width: number = 30
): string {
  // A zero (or negative) total would make the ratio NaN/Infinity; treat it as
  // "no progress" instead of printing `NaN%`.
  const rawRatio = total > 0 ? current / total : 0;

  // Clamp to [0, 1]: an overshoot (current > total) or a negative current
  // would otherwise pass a negative count to String.prototype.repeat, which
  // throws a RangeError.
  const ratio = Number.isNaN(rawRatio) ? 0 : Math.min(1, Math.max(0, rawRatio));

  // Guard against negative widths for the same reason; repeat() already
  // truncates fractional counts, so flooring keeps the previous rendering.
  const barWidth = Math.max(0, Math.floor(width));

  const percentage = Math.round(ratio * 100);
  const progressChars = Math.round(ratio * barWidth);
  const progressBar =
    '█'.repeat(progressChars) + '░'.repeat(barWidth - progressChars);
  return `[${progressBar}] ${percentage}% (${current}/${total})`;
}
592656

@@ -605,7 +669,9 @@ async function generateEmbeddingsForAllFiles() {
605669

606670
for (let i = 0; i < nonDraftPosts.length; i++) {
607671
const post = nonDraftPosts[i];
608-
console.log(`\n=== Processing file ${i+1}/${totalFiles}: ${post.filePath} ===`);
672+
console.log(
673+
`\n=== Processing file ${i + 1}/${totalFiles}: ${post.filePath} ===`
674+
);
609675

610676
const { successfulChunks, failedChunks } =
611677
await generateEmbeddingsForSingleFile(post.filePath);
@@ -614,8 +680,8 @@ async function generateEmbeddingsForAllFiles() {
614680
totalFailed += failedChunks;
615681

616682
// Update progress bar
617-
console.log(createProgressBar(i+1, totalFiles));
618-
683+
console.log(createProgressBar(i + 1, totalFiles));
684+
619685
// Add delay between files
620686
if (i < nonDraftPosts.length - 1) {
621687
console.log(`Waiting ${DELAY_BETWEEN_FILES}ms before next file...`);

0 commit comments

Comments
 (0)