Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 146 additions & 0 deletions packages/pdf/src/__tests__/fm-sinhala.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import { describe, it, expect } from "vitest";
import type { StructuredText } from "mupdf";
import {
isFMFont,
convertFMToUnicode,
extractTextFromStructuredText,
} from "../fm-sinhala.js";

// isFMFont: font-name detection, exercised with names that appear both with
// and without a "XXXXXX+" prefix in front of the family name.
describe("isFMFont", () => {
  it("detects FM Sinhala font names", () => {
    const fmFontNames = ["DJCUQE+FMSamanthax", "FMAbhaya", "ABCDEF+FMMalithi"];
    for (const fontName of fmFontNames) {
      expect(isFMFont(fontName)).toBe(true);
    }
  });

  it("rejects non-FM font names", () => {
    // The empty string is included as the degenerate "no font name" input.
    const otherFontNames = ["TimesNewRomanPSMT", "Arial", "CPJXEB+TimesNewRomanPSMT", ""];
    for (const fontName of otherFontNames) {
      expect(isFMFont(fontName)).toBe(false);
    }
  });
});

// convertFMToUnicode: string-level conversion from FM-encoded text to Unicode
// Sinhala, checked against known input/output pairs.
describe("convertFMToUnicode", () => {
  it("converts FM-encoded Sri Lanka national anthem title", () => {
    // "Y%S ,xld cd;sl .Sh" → "ශ්‍රී ලංකා ජාතික ගීය"
    const fmEncoded = "Y%S ,xld cd;sl .Sh";
    const expected = "ශ්‍රී ලංකා ජාතික ගීය";
    expect(convertFMToUnicode(fmEncoded)).toBe(expected);
  });

  it("preserves digits and spaces", () => {
    // NOTE(review): the input below contains digits only; no space is exercised here.
    const digitsOnly = "2020";
    expect(convertFMToUnicode(digitsOnly)).toBe(digitsOnly);
  });

  it("handles mixed digits and FM text", () => {
    // Intended mapping per the original note: 1 stays, ' → . (period), space stays.
    // Only the presence of the digit is asserted, not the full output string.
    const mixedInput = "1' ";
    expect(convertFMToUnicode(mixedInput)).toContain("1");
  });

  it("returns empty string unchanged", () => {
    const empty = "";
    expect(convertFMToUnicode(empty)).toBe("");
  });

  it("handles multi-character FM sequences correctly", () => {
    // "l%d" → "ක්‍රා" (conjunct: ka + virama + ZWJ + ra + aa)
    const fmConjunct = "l%d";
    const expected = "ක්‍රා";
    expect(convertFMToUnicode(fmConjunct)).toBe(expected);
  });
});

/** One line of mock page text: the name of the font it uses and its raw string. */
type MockLine = {
  fontName: string;
  text: string;
};

/**
 * Builds a minimal stand-in for a `StructuredText` object for use in tests.
 *
 * The returned object implements only three members:
 * - `asText()` returns `input.asText` verbatim.
 * - `asJSON()` returns a JSON string of the form
 *   `{ blocks: [{ type, lines: [{ font: { name }, text }] }] }` built from `input.blocks`.
 * - `walk(walker)` calls `walker.onChar` (when present) once per entry of
 *   `input.walkFontNames`, passing a font stub whose `getName()` returns that entry.
 *   The character is always "x" and all geometry/colour arguments are zeros.
 *
 * @param input - Canned values the mock should report.
 * @returns The mock, cast to `StructuredText`; it is not a complete implementation.
 */
function createMockStructuredText(input: {
  asText: string;
  walkFontNames: string[];
  blocks: Array<{ type: string; lines: MockLine[] }>;
}): StructuredText {
  // Geometry is rebuilt for every callback so no object is shared between calls.
  const zeroPoint = () => ({ x: 0, y: 0 });
  const zeroQuad = () => ({
    ul: zeroPoint(),
    ur: zeroPoint(),
    ll: zeroPoint(),
    lr: zeroPoint(),
  });

  const serializeLine = (line: MockLine) => ({
    font: { name: line.fontName },
    text: line.text,
  });

  // The double cast is deliberate: only the members listed above exist on the mock.
  return {
    asText: () => input.asText,
    asJSON: () => {
      const blocks = input.blocks.map(({ type, lines }) => ({
        type,
        lines: lines.map((line) => serializeLine(line)),
      }));
      return JSON.stringify({ blocks });
    },
    walk: (walker) => {
      const fontNames = input.walkFontNames;
      for (let i = 0; i < fontNames.length; i++) {
        const fontName = fontNames[i];
        walker.onChar?.(
          "x",
          zeroPoint(),
          { getName: () => fontName },
          12,
          zeroQuad(),
          [0, 0, 0]
        );
      }
    },
  } as unknown as StructuredText;
}

// extractTextFromStructuredText: page-level extraction, driven through the
// createMockStructuredText helper instead of a real document.
describe("extractTextFromStructuredText", () => {
  const FM_FONT = "DJCUQE+FMSamanthax";
  const LATIN_FONT = "TimesNewRomanPSMT";

  // Shorthand for a mock line set in the FM font.
  const fmLine = (text: string): MockLine => ({ fontName: FM_FONT, text });

  it("collapses extra newlines in non-FM text", () => {
    // Four consecutive newlines in the plain text are expected to come back as two.
    const page = createMockStructuredText({
      asText: "alpha\n\n\n\nbeta",
      walkFontNames: [LATIN_FONT],
      blocks: [],
    });

    const extracted = extractTextFromStructuredText(page);

    expect(extracted).toBe("alpha\n\nbeta");
  });

  it("converts FM text and collapses extra newlines", () => {
    // The first block ends with three empty lines; the expected output keeps a
    // single blank line between the two converted words.
    const firstBlock = {
      type: "text",
      lines: [fmLine("Y%S"), fmLine(""), fmLine(""), fmLine("")],
    };
    const secondBlock = {
      type: "text",
      lines: [fmLine("l%d")],
    };
    const page = createMockStructuredText({
      asText: "",
      walkFontNames: [FM_FONT],
      blocks: [firstBlock, secondBlock],
    });

    const extracted = extractTextFromStructuredText(page);

    expect(extracted).toBe("ශ්‍රී\n\nක්‍රා");
  });

  it("converts only FM lines when a page has mixed fonts", () => {
    // The middle line uses a non-FM font and is expected to pass through untouched.
    const mixedBlock = {
      type: "text",
      lines: [
        fmLine("Y%S"),
        { fontName: LATIN_FONT, text: "Chapter 1" },
        fmLine("l%d"),
      ],
    };
    const page = createMockStructuredText({
      asText: "",
      walkFontNames: [FM_FONT, LATIN_FONT],
      blocks: [mixedBlock],
    });

    const extracted = extractTextFromStructuredText(page);

    expect(extracted).toBe("ශ්‍රී\nChapter 1\nක්‍රා");
  });
});
11 changes: 6 additions & 5 deletions packages/pdf/src/extract.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import mupdf, {
type PDFObject,
} from "mupdf";
import { cropPng, decodePng, stitchPngsHorizontally } from "./png-utils.js";
+ import { extractTextFromStructuredText } from "./fm-sinhala.js";
import { renderSvgToPng } from "./svg-render.js";

// ============================================================================
Expand Down Expand Up @@ -318,9 +319,9 @@ async function extractPage(doc: MupdfDocument, pageIndex: number): Promise<Extra
hash: hashBuffer(pagePngBuf),
};

- // Extract text
+ // Extract text (handles legacy FM Sinhala font remapping when detected)
const stext = page.toStructuredText();
- const text = stext.asText();
+ const text = extractTextFromStructuredText(stext);

// Extract raster images directly from PDF objects (not SVG)
const pdfDoc = doc as unknown as PDFDocument;
Expand Down Expand Up @@ -379,9 +380,9 @@ async function extractSpreadPage(
hash: hashBuffer(pagePngBuf),
};

- // Concatenate text from both pages
- const leftText = leftPage.toStructuredText().asText();
- const rightText = rightPage.toStructuredText().asText();
+ // Concatenate text from both pages (handles legacy FM Sinhala font remapping)
+ const leftText = extractTextFromStructuredText(leftPage.toStructuredText());
+ const rightText = extractTextFromStructuredText(rightPage.toStructuredText());
const text = leftText + "\n" + rightText;

// Extract raster images from both pages
Expand Down
Loading