diff --git a/src/scraper/fetcher/FileFetcher.ts b/src/scraper/fetcher/FileFetcher.ts
index aa8f08b..a1b60d6 100644
--- a/src/scraper/fetcher/FileFetcher.ts
+++ b/src/scraper/fetcher/FileFetcher.ts
@@ -47,6 +47,8 @@ export class FileFetcher implements ContentFetcher {
         return "text/markdown";
       case ".txt":
         return "text/plain";
+      case ".json":
+        return "application/json";
       default:
         return "application/octet-stream";
     }
diff --git a/src/scraper/pipelines/JsonPipeline.test.ts b/src/scraper/pipelines/JsonPipeline.test.ts
new file mode 100644
index 0000000..04c4e70
--- /dev/null
+++ b/src/scraper/pipelines/JsonPipeline.test.ts
@@ -0,0 +1,114 @@
+import { describe, expect, it } from "vitest";
+import type { RawContent } from "../fetcher/types";
+import { JsonPipeline } from "./JsonPipeline";
+
+// Helper: pretty-print JSON for easier assertions
+function pretty(json: unknown) {
+  return JSON.stringify(json, null, 2);
+}
+
+// Minimal valid ScraperOptions for tests
+const dummyOptions = {
+  url: "test.json",
+  library: "test",
+  version: "1.0",
+};
+// Dummy ContentFetcher implementation
+const dummyFetcher = {
+  canFetch: () => false,
+  fetch: async () => Promise.reject(new Error("Not implemented")),
+};
+
+describe("JsonPipeline", () => {
+  it("canProcess returns true for JSON MIME types", () => {
+    const pipeline = new JsonPipeline();
+    const validTypes = [
+      "application/json",
+      "application/ld+json",
+      "application/vnd.api+json",
+      "text/json",
+      "application/json5",
+    ];
+    for (const mimeType of validTypes) {
+      expect(pipeline.canProcess({ mimeType } as RawContent)).toBe(true);
+    }
+  });
+
+  it("canProcess returns false for non-JSON MIME types", () => {
+    const pipeline = new JsonPipeline();
+    const invalidTypes = [
+      "text/html",
+      "text/plain",
+      "application/xml",
+      "text/markdown",
+      "image/png",
+    ];
+    for (const mimeType of invalidTypes) {
+      expect(pipeline.canProcess({ mimeType } as RawContent)).toBe(false);
+    }
+  });
+
+  it("splits large JSON arrays into valid JSON chunks", async () => {
+    const pipeline = new JsonPipeline();
+    const arr = Array.from({ length: 100 }, (_, i) => ({ id: i, value: `item${i}` }));
+    const raw: RawContent = {
+      content: pretty(arr),
+      mimeType: "application/json",
+      source: "test.json",
+    };
+    const result = await pipeline.process(raw, dummyOptions, dummyFetcher);
+    // Should produce multiple chunks, each valid JSON
+    const chunks = result.textContent.split("\n");
+    for (const chunk of chunks) {
+      expect(() => JSON.parse(chunk)).not.toThrow();
+    }
+    // Should cover all items
+    const allItems = chunks.flatMap((chunk) => JSON.parse(chunk));
+    expect(allItems.length).toBe(100);
+  });
+
+  it("splits large JSON objects into valid JSON chunks", async () => {
+    const pipeline = new JsonPipeline();
+    const obj: Record<string, unknown> = {};
+    for (let i = 0; i < 100; i++) obj[`key${i}`] = { id: i, value: `item${i}` };
+    const raw: RawContent = {
+      content: pretty(obj),
+      mimeType: "application/json",
+      source: "test.json",
+    };
+    const result = await pipeline.process(raw, dummyOptions, dummyFetcher);
+    const chunks = result.textContent.split("\n");
+    for (const chunk of chunks) {
+      expect(() => JSON.parse(chunk)).not.toThrow();
+    }
+    // Should cover all keys
+    const allKeys = chunks.flatMap((chunk) => Object.keys(JSON.parse(chunk)));
+    expect(new Set(allKeys).size).toBe(100);
+  });
+
+  it("handles small JSON files as a single chunk", async () => {
+    const pipeline = new JsonPipeline();
+    const data = { foo: 1, bar: [1, 2, 3] };
+    const raw: RawContent = {
+      content: pretty(data),
mimeType: "application/json", + source: "test.json", + }; + const result = await pipeline.process(raw, dummyOptions, dummyFetcher); + // Should be a single chunk + expect(result.textContent.split("\n").length).toBe(1); + expect(() => JSON.parse(result.textContent)).not.toThrow(); + }); + + it("returns metadata with the source as title", async () => { + const pipeline = new JsonPipeline(); + const data = { foo: "bar" }; + const raw: RawContent = { + content: pretty(data), + mimeType: "application/json", + source: "test.json", + }; + const result = await pipeline.process(raw, dummyOptions, dummyFetcher); + expect(result.metadata.title).toBe("test.json"); + }); +}); diff --git a/src/scraper/pipelines/JsonPipeline.ts b/src/scraper/pipelines/JsonPipeline.ts new file mode 100644 index 0000000..9ab9da3 --- /dev/null +++ b/src/scraper/pipelines/JsonPipeline.ts @@ -0,0 +1,39 @@ +import { JsonContentSplitter } from "../../splitter/splitters/JsonContentSplitter"; +import type { RawContent } from "../fetcher/types"; +import type { ContentFetcher } from "../fetcher/types"; +import type { ScraperOptions } from "../types"; +import { BasePipeline } from "./BasePipeline"; +import type { ProcessedContent } from "./types"; + +/** + * Pipeline for processing JSON content using the JsonContentSplitter. + */ +export class JsonPipeline extends BasePipeline { + canProcess(raw: RawContent): boolean { + return ( + typeof raw.mimeType === "string" && + (raw.mimeType === "application/json" || + raw.mimeType === "application/ld+json" || + raw.mimeType.endsWith("+json") || + raw.mimeType === "text/json" || + raw.mimeType === "application/json5") + ); + } + + async process( + raw: RawContent, + _options: ScraperOptions, + _fetcher: ContentFetcher, + ): Promise { + const content = + typeof raw.content === "string" ? 
+    const splitter = new JsonContentSplitter({ chunkSize: 5000 }); // Use a reasonable default chunk size
+    const chunks = await splitter.split(content);
+    return {
+      textContent: chunks.join("\n"),
+      metadata: { title: raw.source },
+      links: [], // JSON doesn't typically have links
+      errors: [],
+    };
+  }
+}
diff --git a/src/scraper/strategies/LocalFileStrategy.test.ts b/src/scraper/strategies/LocalFileStrategy.test.ts
index ab19922..5285522 100644
--- a/src/scraper/strategies/LocalFileStrategy.test.ts
+++ b/src/scraper/strategies/LocalFileStrategy.test.ts
@@ -208,4 +208,35 @@ describe("LocalFileStrategy", () => {
       }),
     );
   });
+
+  it("should process .json files using the JsonPipeline", async () => {
+    const strategy = new LocalFileStrategy();
+    const options: ScraperOptions = {
+      url: "file:///testdir",
+      library: "test",
+      version: "1.0",
+      maxPages: 10,
+      maxDepth: 1,
+      maxConcurrency: 1,
+    };
+    const progressCallback = vi.fn();
+    const jsonContent = JSON.stringify({ a: 1, b: [2, 3, 4], c: { d: 5 } });
+    vol.fromJSON(
+      {
+        "/testdir/data.json": jsonContent,
+      },
+      "/",
+    );
+
+    await strategy.scrape(options, progressCallback);
+    expect(progressCallback).toHaveBeenCalledTimes(1);
+    const call = progressCallback.mock.calls[0][0];
+    expect(call.currentUrl).toBe("file:///testdir/data.json");
+    // Parse the output and check structure, not formatting
+    const parsed = JSON.parse(call.document.content);
+    expect(parsed.a).toBe(1);
+    expect(parsed.b).toEqual([2, 3, 4]);
+    expect(parsed.c).toEqual({ d: 5 });
+    expect(call.document.metadata.url).toBe("file:///testdir/data.json");
+  });
 });
diff --git a/src/scraper/strategies/LocalFileStrategy.ts b/src/scraper/strategies/LocalFileStrategy.ts
index 635b240..dca6e3c 100644
--- a/src/scraper/strategies/LocalFileStrategy.ts
+++ b/src/scraper/strategies/LocalFileStrategy.ts
@@ -2,9 +2,11 @@ import fs from "node:fs/promises";
 import path from "node:path";
 import type { Document, ProgressCallback } from "../../types";
 import { logger } from "../../utils/logger";
+import { MimeTypeUtils } from "../../utils/mimeTypeUtils";
 import { FileFetcher } from "../fetcher";
 import type { RawContent } from "../fetcher/types";
 import { HtmlPipeline } from "../pipelines/HtmlPipeline";
+import { JsonPipeline } from "../pipelines/JsonPipeline";
 import { MarkdownPipeline } from "../pipelines/MarkdownPipeline";
 import type { ScraperOptions, ScraperProgress } from "../types";
 import { BaseScraperStrategy, type QueueItem } from "./BaseScraperStrategy";
@@ -13,13 +15,15 @@ export class LocalFileStrategy extends BaseScraperStrategy {
   private readonly fileFetcher = new FileFetcher();
   private readonly htmlPipeline: HtmlPipeline;
   private readonly markdownPipeline: MarkdownPipeline;
-  private readonly pipelines: [HtmlPipeline, MarkdownPipeline];
+  private readonly jsonPipeline: JsonPipeline;
+  private readonly pipelines: [HtmlPipeline, MarkdownPipeline, JsonPipeline];
 
   constructor() {
     super();
     this.htmlPipeline = new HtmlPipeline();
    this.markdownPipeline = new MarkdownPipeline();
-    this.pipelines = [this.htmlPipeline, this.markdownPipeline];
+    this.jsonPipeline = new JsonPipeline();
+    this.pipelines = [this.htmlPipeline, this.markdownPipeline, this.jsonPipeline];
   }
 
   canHandle(url: string): boolean {
@@ -92,6 +96,7 @@ export class LocalFileStrategy extends BaseScraperStrategy {
     } finally {
       await this.htmlPipeline.close();
       await this.markdownPipeline.close();
+      await this.jsonPipeline.close();
     }
   }
 }
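Both strategies in this change only register the new JsonPipeline; the dispatch loop that actually picks a pipeline is unchanged and not shown in this diff. A minimal sketch of the behavior this wiring assumes (first pipeline whose canProcess returns true wins; the function name and placement are illustrative, not part of the PR):

```ts
// Illustrative sketch only; the real selection logic lives in the unchanged strategy code.
import type { ContentFetcher, RawContent } from "../fetcher/types";
import type { ContentPipeline, ProcessedContent } from "../pipelines/types";
import type { ScraperOptions } from "../types";

async function runFirstMatchingPipeline(
  raw: RawContent,
  options: ScraperOptions,
  fetcher: ContentFetcher,
  pipelines: ContentPipeline[], // e.g. [htmlPipeline, markdownPipeline, jsonPipeline]
): Promise<ProcessedContent | undefined> {
  for (const pipeline of pipelines) {
    // JsonPipeline.canProcess now claims application/json, */*+json, text/json, and json5
    if (pipeline.canProcess(raw)) {
      return pipeline.process(raw, options, fetcher);
    }
  }
  return undefined; // no pipeline claimed the content
}
```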
diff --git a/src/scraper/strategies/WebScraperStrategy.test.ts b/src/scraper/strategies/WebScraperStrategy.test.ts
index ff5a12c..82d28f2 100644
--- a/src/scraper/strategies/WebScraperStrategy.test.ts
+++ b/src/scraper/strategies/WebScraperStrategy.test.ts
@@ -492,4 +492,25 @@ describe("WebScraperStrategy", () => {
     expect(docCall![0].document.content).toContain(expectedMarkdown);
     expect(docCall![0].document.metadata.title).toBe(expectedTitle);
   });
+
+  it("should process .json files using the JsonPipeline", async () => {
+    const progressCallback = vi.fn();
+    const testUrl = "https://example.com/data.json";
+    options.url = testUrl;
+    const jsonContent = JSON.stringify({ foo: [1, 2, 3], bar: { baz: true } });
+    mockFetchFn.mockResolvedValue({
+      content: jsonContent,
+      mimeType: "application/json",
+      source: testUrl,
+    });
+    await strategy.scrape(options, progressCallback);
+    expect(progressCallback).toHaveBeenCalled();
+    const call = progressCallback.mock.calls[0][0];
+    expect(call.currentUrl).toBe(testUrl);
+    // Parse the output and check structure, not formatting
+    const parsed = JSON.parse(call.document.content);
+    expect(parsed.foo).toEqual([1, 2, 3]);
+    expect(parsed.bar).toEqual({ baz: true });
+    expect(call.document.metadata.url).toBe(testUrl);
+  });
 });
diff --git a/src/scraper/strategies/WebScraperStrategy.ts b/src/scraper/strategies/WebScraperStrategy.ts
index 47bd21a..b86e857 100644
--- a/src/scraper/strategies/WebScraperStrategy.ts
+++ b/src/scraper/strategies/WebScraperStrategy.ts
@@ -5,6 +5,7 @@ import { hasSameDomain, hasSameHostname, isSubpath } from "../../utils/url";
 import { HttpFetcher } from "../fetcher";
 import type { RawContent } from "../fetcher/types";
 import { HtmlPipeline } from "../pipelines/HtmlPipeline";
+import { JsonPipeline } from "../pipelines/JsonPipeline";
 import { MarkdownPipeline } from "../pipelines/MarkdownPipeline";
 import type { ContentPipeline, ProcessedContent } from "../pipelines/types";
 import type { ScraperOptions, ScraperProgress } from "../types";
@@ -20,6 +21,7 @@ export class WebScraperStrategy extends BaseScraperStrategy {
   private readonly shouldFollowLinkFn?: (baseUrl: URL, targetUrl: URL) => boolean;
   private readonly htmlPipeline: HtmlPipeline;
   private readonly markdownPipeline: MarkdownPipeline;
+  private readonly jsonPipeline: JsonPipeline;
   private readonly pipelines: ContentPipeline[];
 
   constructor(options: WebScraperStrategyOptions = {}) {
@@ -27,7 +29,8 @@ export class WebScraperStrategy extends BaseScraperStrategy {
     this.shouldFollowLinkFn = options.shouldFollowLink;
     this.htmlPipeline = new HtmlPipeline();
     this.markdownPipeline = new MarkdownPipeline();
-    this.pipelines = [this.htmlPipeline, this.markdownPipeline];
+    this.jsonPipeline = new JsonPipeline();
+    this.pipelines = [this.htmlPipeline, this.markdownPipeline, this.jsonPipeline];
   }
 
   canHandle(url: string): boolean {
diff --git a/src/splitter/SemanticMarkdownSplitter.test.ts b/src/splitter/SemanticMarkdownSplitter.test.ts
index 96d3fd6..13b71a9 100644
--- a/src/splitter/SemanticMarkdownSplitter.test.ts
+++ b/src/splitter/SemanticMarkdownSplitter.test.ts
@@ -371,4 +371,36 @@ ${codeLines}
     // Each chunk should be under the max size
     expect(result.every((chunk) => chunk.content.length <= 20)).toBe(true);
   });
+
+  it("should split large JSON code blocks into valid JSON chunks", async () => {
+    const splitter = new SemanticMarkdownSplitter(10, 50); // small chunk size for test
+    const jsonArray = Array.from({ length: 10 }, (_, i) => ({ id: i, value: `val${i}` }));
+    const markdown = ["```json", JSON.stringify(jsonArray), "```"].join("\n");
+    const result = await splitter.splitText(markdown);
+    // All chunks should be code, valid JSON, and within size
+    expect(result.length).toBeGreaterThan(1);
+    for (const chunk of result) {
+      expect(chunk.types).toEqual(["code"]);
+      expect(chunk.content).toMatch(/^```json\n[\s\S]*\n```$/);
+      // Extract JSON body
+      const body = chunk.content.replace(/^```json\n/, "").replace(/\n```$/, "");
+      expect(() => JSON.parse(body)).not.toThrow();
+      expect(chunk.content.length).toBeLessThanOrEqual(50);
+    }
+  });
+
+  it("should fall back to normal code splitting for non-JSON code blocks", async () => {
+    const splitter = new SemanticMarkdownSplitter(10, 50);
+    const codeLines = Array.from({ length: 10 }, (_, i) => `console.log(${i});`).join(
+      "\n",
+    );
+    const markdown = ["```js", codeLines, "```"].join("\n");
+    const result = await splitter.splitText(markdown);
+    expect(result.length).toBeGreaterThan(1);
+    for (const chunk of result) {
+      expect(chunk.types).toEqual(["code"]);
+      expect(chunk.content).toMatch(/^```js\n[\s\S]*\n```$/);
+      expect(chunk.content.length).toBeLessThanOrEqual(50);
+    }
+  });
 });
diff --git a/src/splitter/SemanticMarkdownSplitter.ts b/src/splitter/SemanticMarkdownSplitter.ts
index 130e844..cd008ff 100644
--- a/src/splitter/SemanticMarkdownSplitter.ts
+++ b/src/splitter/SemanticMarkdownSplitter.ts
@@ -10,6 +10,7 @@ import { logger } from "../utils/logger";
 import { fullTrim } from "../utils/string";
 import { ContentSplitterError, MinimumChunkSizeError } from "./errors";
 import { CodeContentSplitter } from "./splitters/CodeContentSplitter";
+import { JsonContentSplitter } from "./splitters/JsonContentSplitter";
 import { TableContentSplitter } from "./splitters/TableContentSplitter";
 import { TextContentSplitter } from "./splitters/TextContentSplitter";
 import type { ContentChunk, DocumentSplitter, SectionContentType } from "./types";
@@ -40,6 +41,7 @@ export class SemanticMarkdownSplitter implements DocumentSplitter {
   public textSplitter: TextContentSplitter;
   public codeSplitter: CodeContentSplitter;
   public tableSplitter: TableContentSplitter;
+  public jsonSplitter: JsonContentSplitter;
 
   constructor(
     private preferredChunkSize: number,
@@ -97,6 +99,9 @@ export class SemanticMarkdownSplitter implements DocumentSplitter {
     this.tableSplitter = new TableContentSplitter({
       chunkSize: this.maxChunkSize,
     });
+    this.jsonSplitter = new JsonContentSplitter({
+      chunkSize: this.maxChunkSize,
+    });
   }
 
   /**
@@ -233,7 +238,27 @@ export class SemanticMarkdownSplitter implements DocumentSplitter {
           break;
         }
         case "code": {
-          splitContent = await this.codeSplitter.split(content.text);
+          // Detect JSON code blocks
+          if (/^```json\s*/i.test(content.text)) {
+            // Remove code block markers for splitting
+            const jsonBody = content.text
+              .replace(/^```json\s*/i, "")
+              .replace(/```\s*$/, "");
+            // Account for code block wrapper overhead
+            const wrapperSize = "```json\n".length + "\n```".length; // 8 + 4 = 12
+            const allowedChunkSize = Math.max(1, this.maxChunkSize - wrapperSize);
+            // Use a temporary JsonContentSplitter with reduced chunk size
+            const jsonSplitter = new JsonContentSplitter({
+              chunkSize: allowedChunkSize,
+            });
+            splitContent = await jsonSplitter.split(jsonBody);
+            // Re-wrap as code blocks
+            splitContent = splitContent.map((chunk) =>
+              ["```json", chunk, "```"].join("\n"),
+            );
+          } else {
+            splitContent = await this.codeSplitter.split(content.text);
+          }
           break;
         }
         case "table": {
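To make the wrapper-overhead arithmetic in the `case "code"` branch concrete, here is a small worked example. The maxChunkSize of 50 matches the new splitter test above; everything else is just the same calculation spelled out and is illustrative rather than part of the change:

```ts
// Worked example of the wrapper-overhead budget (maxChunkSize assumed to be 50, as in the test).
const maxChunkSize = 50;
const wrapperSize = "```json\n".length + "\n```".length; // 8 + 4 = 12
const allowedChunkSize = Math.max(1, maxChunkSize - wrapperSize); // 50 - 12 = 38
// A JSON chunk of at most 38 characters, once re-wrapped as
// "```json\n" + chunk + "\n```", is at most 38 + 12 = 50 characters,
// which is exactly the length bound the test asserts on each chunk.
```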
diff --git a/src/splitter/splitters/JsonContentSplitter.test.ts b/src/splitter/splitters/JsonContentSplitter.test.ts
new file mode 100644
index 0000000..a48dced
--- /dev/null
+++ b/src/splitter/splitters/JsonContentSplitter.test.ts
@@ -0,0 +1,53 @@
+import { describe, expect, it } from "vitest";
+import { JsonContentSplitter } from "./JsonContentSplitter";
+
+describe("JsonContentSplitter", () => {
+  const chunkSize = 50;
+  const splitter = new JsonContentSplitter({ chunkSize });
+
+  it("splits large arrays into valid JSON chunks", async () => {
+    const arr = Array.from({ length: 20 }, (_, i) => i);
+    const json = JSON.stringify(arr);
+    const chunks = await splitter.split(json);
+    for (const chunk of chunks) {
+      expect(JSON.parse(chunk)).toBeDefined();
+      expect(chunk.length).toBeLessThanOrEqual(chunkSize);
+    }
+    expect(chunks.join("")).toContain("0");
+  });
+
+  it("splits large objects into valid JSON chunks", async () => {
+    const obj = Object.fromEntries(Array.from({ length: 10 }, (_, i) => [`k${i}`, i]));
+    const json = JSON.stringify(obj);
+    const chunks = await splitter.split(json);
+    for (const chunk of chunks) {
+      expect(JSON.parse(chunk)).toBeDefined();
+      expect(chunk.length).toBeLessThanOrEqual(chunkSize);
+    }
+    expect(chunks.join("")).toContain("k0");
+  });
+
+  it("handles nested structures recursively", async () => {
+    const obj = { a: Array(10).fill({ b: Array(10).fill(1) }) };
+    const json = JSON.stringify(obj);
+    const chunks = await splitter.split(json);
+    for (const chunk of chunks) {
+      expect(JSON.parse(chunk)).toBeDefined();
+      expect(chunk.length).toBeLessThanOrEqual(chunkSize);
+    }
+  });
+
+  it("returns single chunk for small JSON", async () => {
+    const json = JSON.stringify({ a: 1 });
+    const chunks = await splitter.split(json);
+    expect(chunks.length).toBe(1);
+    expect(JSON.parse(chunks[0])).toEqual({ a: 1 });
+  });
+
+  it("returns input as single chunk for invalid JSON", async () => {
+    const input = "not a json";
+    const chunks = await splitter.split(input);
+    expect(chunks.length).toBe(1);
+    expect(chunks[0]).toBe(input);
+  });
+});
diff --git a/src/splitter/splitters/JsonContentSplitter.ts b/src/splitter/splitters/JsonContentSplitter.ts
new file mode 100644
index 0000000..c09d6d0
--- /dev/null
+++ b/src/splitter/splitters/JsonContentSplitter.ts
@@ -0,0 +1,91 @@
+import type { ContentSplitter, ContentSplitterOptions } from "./types";
+
+/**
+ * Splits large JSON content into valid JSON chunks by recursively splitting arrays and objects.
+ * Ensures each chunk is a valid JSON string and does not exceed the specified chunk size.
+ *
+ * - For arrays: splits by elements, recursively splitting if needed.
+ * - For objects: splits by key-value pairs, recursively splitting if needed.
+ * - For primitives or large single elements: returns as a single chunk or throws if too large.
+ */
+export class JsonContentSplitter implements ContentSplitter {
+  constructor(private options: ContentSplitterOptions) {}
+
+  /**
+   * Splits JSON content into valid JSON chunks respecting the chunk size.
+   * @param content JSON string
+   */
+  async split(content: string): Promise<string[]> {
+    let root: unknown;
+    try {
+      root = JSON.parse(content);
+    } catch (err) {
+      // If not valid JSON, return as a single chunk
+      return [content];
+    }
+    return this.splitNode(root);
+  }
+
+  private splitNode(node: unknown): string[] {
+    const json = JSON.stringify(node);
+    if (json.length <= this.options.chunkSize) {
+      return [json];
+    }
+    if (Array.isArray(node)) {
+      return this.splitArray(node);
+    }
+    if (node && typeof node === "object") {
+      return this.splitObject(node as Record<string, unknown>);
+    }
+    // Primitive too large, return as is
+    return [json];
+  }
+
+  private splitArray(arr: unknown[]): string[] {
+    const result: string[] = [];
+    let currentChunk: unknown[] = [];
+    for (const el of arr) {
+      const testChunk = [...currentChunk, el];
+      const json = JSON.stringify(testChunk);
+      if (json.length > this.options.chunkSize) {
+        if (currentChunk.length === 0) {
+          // Single element too large, split recursively
+          result.push(...this.splitNode(el));
+        } else {
+          result.push(JSON.stringify(currentChunk));
+          currentChunk = [el];
+        }
+      } else {
+        currentChunk.push(el);
+      }
+    }
+    if (currentChunk.length > 0) {
+      result.push(JSON.stringify(currentChunk));
+    }
+    return result;
+  }
+
+  private splitObject(obj: Record<string, unknown>): string[] {
+    const result: string[] = [];
+    let currentChunk: Record<string, unknown> = {};
+    for (const [key, value] of Object.entries(obj)) {
+      const testChunk = { ...currentChunk, [key]: value };
+      const json = JSON.stringify(testChunk);
+      if (json.length > this.options.chunkSize) {
+        if (Object.keys(currentChunk).length === 0) {
+          // Single property too large, split recursively
+          result.push(...this.splitNode(value));
+        } else {
+          result.push(JSON.stringify(currentChunk));
+          currentChunk = { [key]: value };
+        }
+      } else {
+        currentChunk[key] = value;
+      }
+    }
+    if (Object.keys(currentChunk).length > 0) {
+      result.push(JSON.stringify(currentChunk));
+    }
+    return result;
+  }
+}
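For reference, a short usage sketch of the splitter on its own; the chunk size and input data are chosen only for illustration and are not part of the change. Every chunk it returns parses as standalone JSON, which is the property the JsonContentSplitter tests above assert:

```ts
// Illustrative usage; chunkSize and input chosen only to show the greedy packing.
import { JsonContentSplitter } from "./JsonContentSplitter";

const splitter = new JsonContentSplitter({ chunkSize: 40 });
const input = JSON.stringify(Array.from({ length: 8 }, (_, i) => ({ id: i })));

const chunks = await splitter.split(input);
// The first chunk packs as many elements as fit in 40 characters:
// '[{"id":0},{"id":1},{"id":2},{"id":3}]' (37 chars), and the rest follow in later chunks.
for (const chunk of chunks) {
  JSON.parse(chunk); // never throws: each chunk is valid JSON on its own
}
```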
diff --git a/src/tools/FetchUrlTool.test.ts b/src/tools/FetchUrlTool.test.ts
index aec2168..b05b56a 100644
--- a/src/tools/FetchUrlTool.test.ts
+++ b/src/tools/FetchUrlTool.test.ts
@@ -1,5 +1,6 @@
 import { beforeEach, describe, expect, it, vi } from "vitest";
 import type { FileFetcher, HttpFetcher } from "../scraper/fetcher";
+import { JsonPipeline } from "../scraper/pipelines/JsonPipeline";
 import { ScraperError } from "../utils/errors";
 import { logger } from "../utils/logger";
 import { FetchUrlTool, type FetchUrlToolOptions } from "./FetchUrlTool";
@@ -174,4 +175,21 @@ describe("FetchUrlTool", () => {
     );
     expect(result).toBe(imageBuffer.toString("utf-8"));
   });
+
+  it("should process .json files using the JsonPipeline", async () => {
+    mockHttpFetcher.canFetch = vi.fn().mockReturnValue(true); // Ensure fetcher is selected
+    mockHttpFetcher.fetch = vi.fn().mockResolvedValue({
+      content: JSON.stringify({ x: [10, 20], y: { z: "ok" } }),
+      mimeType: "application/json",
+      source: "https://example.com/data.json",
+    });
+    const tool = new FetchUrlTool(
+      mockHttpFetcher as HttpFetcher,
+      mockFileFetcher as FileFetcher,
+    );
+    const result = await tool.execute({ url: "https://example.com/data.json" });
+    const parsed = JSON.parse(result);
+    expect(parsed.x).toEqual([10, 20]);
+    expect(parsed.y).toEqual({ z: "ok" });
+  });
 });
diff --git a/src/tools/FetchUrlTool.ts b/src/tools/FetchUrlTool.ts
index 7437dd0..3c847d3 100644
--- a/src/tools/FetchUrlTool.ts
+++ b/src/tools/FetchUrlTool.ts
@@ -5,10 +5,12 @@ import type {
   RawContent,
 } from "../scraper/fetcher";
 import { HtmlPipeline } from "../scraper/pipelines/HtmlPipeline";
+import { JsonPipeline } from "../scraper/pipelines/JsonPipeline";
 import { MarkdownPipeline } from "../scraper/pipelines/MarkdownPipeline";
 import { ScrapeMode } from "../scraper/types";
 import { ScraperError } from "../utils/errors";
 import { logger } from "../utils/logger";
+import { MimeTypeUtils } from "../utils/mimeTypeUtils";
 import { ToolError } from "./errors";
 
 export interface FetchUrlToolOptions {
@@ -72,7 +74,8 @@ export class FetchUrlTool {
     const fetcher = this.fetchers[fetcherIndex];
     const htmlPipeline = new HtmlPipeline();
     const markdownPipeline = new MarkdownPipeline();
-    const pipelines = [htmlPipeline, markdownPipeline];
+    const jsonPipeline = new JsonPipeline();
+    const pipelines = [htmlPipeline, markdownPipeline, jsonPipeline];
 
     try {
       logger.info(`📡 Fetching ${url}...`);
diff --git a/src/utils/mimeTypeUtils.ts b/src/utils/mimeTypeUtils.ts
index 98df530..b97fac1 100644
--- a/src/utils/mimeTypeUtils.ts
+++ b/src/utils/mimeTypeUtils.ts
@@ -55,5 +55,16 @@ export class MimeTypeUtils {
     return mimeType.startsWith("text/");
   }
 
-  // Extend with more helpers as needed (isJson, isXml, isPdf, etc.)
+  /**
+   * Checks if a MIME type represents JSON content.
+   */
+  public static isJson(mimeType: string): boolean {
+    return (
+      mimeType === "application/json" ||
+      mimeType === "application/ld+json" ||
+      mimeType.endsWith("+json") ||
+      mimeType === "text/json" ||
+      mimeType === "application/json5"
+    );
+  }
 }