-
Notifications
You must be signed in to change notification settings - Fork 358
feat: Implement a PDF conversion script #2466 #2467
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
shaunyogeshwaran
wants to merge
8
commits into
main
Choose a base branch
from
shaun/main/pdf-script
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Changes from all commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
c11007f
script and dependencies
shaunyogeshwaran d04d8b2
add cleanupDom()
shaunyogeshwaran c00f92d
fix tables
shaunyogeshwaran 03e19b7
Fix images
shaunyogeshwaran e2e227a
added working toc and pagination
shaunyogeshwaran ca1faae
comments
shaunyogeshwaran 31b484b
Update generate-pdf.js
shaunyogeshwaran 98380af
Update generate-pdf.js
shaunyogeshwaran File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,370 @@ | ||
| const puppeteer = require('puppeteer'); | ||
| const fs = require('fs'); | ||
| const { PDFDocument, rgb, StandardFonts, PDFName } = require('pdf-lib'); | ||
| const path = require('path'); | ||
|
|
||
| const START_URL = 'https://docs.h2o.ai/h2o-document-ai/get-started/what-is-h2o-document-ai'; | ||
| const PAGINATION_SELECTOR = 'a.pagination-nav__link.pagination-nav__link--next'; | ||
| const OUTPUT_FILENAME = 'doc-ai-documentation.pdf'; | ||
|
|
||
/**
 * Collects the URLs of the documentation pages by starting at START_URL and
 * repeatedly following the "next" pagination link (PAGINATION_SELECTOR).
 *
 * Crawling stops when any of the following happens:
 *   - `maxPages` URLs have been collected,
 *   - a page has no "next" link,
 *   - the "next" link points at a URL that was already visited (cycle),
 *   - the "next" link leaves the origin of START_URL.
 *
 * @param {object} page - Puppeteer page used to load each URL.
 * @param {number} [maxPages=Infinity] - Upper bound on the number of URLs returned.
 * @returns {Promise<string[]>} Visited URLs, in navigation order.
 */
async function getAllPageUrls(page, maxPages = Infinity) {
  const urls = [];
  // Set gives O(1) "already visited?" checks; `urls` keeps the visit order.
  const visited = new Set();
  const trustedOrigin = new URL(START_URL).origin;
  let nextPageUrl = START_URL;

  while (nextPageUrl && urls.length < maxPages && !visited.has(nextPageUrl)) {
    console.log(`Scraping: ${nextPageUrl}`);
    await page.goto(nextPageUrl, { waitUntil: 'networkidle2' });
    urls.push(nextPageUrl);
    visited.add(nextPageUrl);

    // Runs in the browser: `href` (the property) is already an absolute URL.
    const nextHref = await page.evaluate((sel) => {
      const nextLink = document.querySelector(sel);
      return nextLink?.href || null;
    }, PAGINATION_SELECTOR);

    if (!nextHref) break;

    // Compare parsed origins rather than string prefixes so that look-alike
    // hosts (e.g. "docs.h2o.ai.evil.example") cannot pass the check.
    let nextOrigin = null;
    try {
      nextOrigin = new URL(nextHref).origin;
    } catch {
      nextOrigin = null;
    }
    if (nextOrigin !== trustedOrigin) {
      console.warn(`Stopping crawl: next link leaves ${trustedOrigin}: ${nextHref}`);
      break;
    }

    nextPageUrl = nextHref;
  }

  return urls;
}
|
|
||
/**
 * Rewrites the DOM of the page currently loaded in `page` so that it prints
 * cleanly: strips site chrome, expands tabbed content, makes relative image
 * and link URLs absolute, and injects CSS that keeps wide tables on the page.
 *
 * All work happens inside a single `page.evaluate` call, i.e. in the browser
 * context; nothing from the Node scope is available in the callback.
 *
 * @param {object} page - Puppeteer page whose document should be cleaned up.
 * @returns {Promise<void>}
 */
async function cleanupDom(page) {
  await page.evaluate(() => {
    // Remove the footer and the notice/cookie banner.
    document.querySelector('footer')?.remove();
    document.querySelector('section.notice')?.remove();
    // NOTE(review): this removes every "note" admonition from the content as
    // well - confirm that is intended. One selector is enough: any element
    // matching the longer ".theme-admonition.theme-admonition-note.alert..."
    // selector also matches this one.
    document.querySelectorAll('.theme-admonition-note').forEach((el) => el.remove());

    // Expand tabbed panels so every tab's content is printed, each one
    // preceded by a styled copy of its tab label.
    document.querySelectorAll('.tabs-container').forEach((tabsContainer) => {
      const tabButtons = tabsContainer.querySelectorAll('[role="tab"]');
      const tabPanels = tabsContainer.querySelectorAll('[role="tabpanel"]');

      tabPanels.forEach((panel, i) => {
        Object.assign(panel.style, {
          display: 'block',
          visibility: 'visible',
          position: 'static',
          height: 'auto',
          opacity: '1',
        });

        // Panels and buttons are paired by position within the container.
        const originalTabButton = tabButtons[i];
        if (originalTabButton) {
          const clonedTabButton = originalTabButton.cloneNode(true);
          Object.assign(clonedTabButton.style, {
            display: 'inline-block',
            padding: '6px 12px',
            marginBottom: '8px',
            border: '1px solid #ccc',
            borderRadius: '4px',
            backgroundColor: '#f0f0f0',
            color: '#333',
            fontWeight: 'bold',
          });
          panel.parentNode.insertBefore(clonedTabButton, panel);
        }
      });

      // The interactive tab strip is replaced by the per-panel labels above.
      const tabList = tabsContainer.querySelector('[role="tablist"]');
      if (tabList) tabList.style.display = 'none';
    });

    // Matches any URL that already carries a scheme (http:, https:, data:, ...).
    const hasScheme = /^[a-z][a-z\d+.-]*:/i;

    // Make relative image URLs absolute. They must be resolved against the
    // document's base URL, not just the origin: resolving "img/a.png" or
    // "../a.png" against the origin would drop the page's directory and
    // point at a non-existent file.
    document.querySelectorAll('img').forEach((img) => {
      const src = img.getAttribute('src');
      if (src && !hasScheme.test(src)) {
        img.src = new URL(src, document.baseURI).href;
      }
    });

    // Make root-relative links absolute so they still work from the PDF.
    document.querySelectorAll('a[href^="/"]').forEach((a) => {
      a.href = new URL(a.getAttribute('href'), document.baseURI).href;
    });

    // Force-inject CSS to keep wide tables inside the page (temporary fix).
    const style = document.createElement('style');
    style.textContent = `
      table {
        table-layout: fixed !important;
        width: 100% !important;
        max-width: 100% !important;
        border-collapse: collapse;
        word-wrap: break-word;
        overflow-wrap: break-word;
      }
      th, td {
        word-break: break-word !important;
        white-space: normal !important;
        overflow-wrap: break-word !important;
        max-width: 200px; /* Adjust max width per cell as needed */
        padding: 8px !important;
        border: 1px solid #ddd !important;
      }
      /* Optional: wrap tables in a scroll container if needed */
      .table-wrapper {
        overflow-x: auto;
        -webkit-overflow-scrolling: touch;
        margin-bottom: 1em;
      }
    `;
    document.head.appendChild(style);

    // Wrap every table that is not already wrapped in a .table-wrapper div.
    document.querySelectorAll('table').forEach((table) => {
      if (!table.parentNode.classList.contains('table-wrapper')) {
        const wrapper = document.createElement('div');
        wrapper.className = 'table-wrapper';
        table.parentNode.insertBefore(wrapper, table);
        wrapper.appendChild(table);
      }
    });
  });
}
|
|
||
/**
 * Scrolls the loaded page from top to bottom in fixed steps so that content
 * which only loads when scrolled into view has a chance to load.
 *
 * The page height is re-read on every step, so the scroll keeps going if the
 * document grows while scrolling.
 *
 * @param {object} page - Puppeteer page to scroll.
 * @param {object} [options]
 * @param {number} [options.distance=100] - Pixels scrolled per step; must be > 0.
 * @param {number} [options.delay=100] - Milliseconds between steps; must be >= 0.
 * @returns {Promise<void>} Resolves once the scrolled distance reaches the page height.
 * @throws {RangeError} If `distance` or `delay` is out of range.
 */
async function autoScroll(page, { distance = 100, delay = 100 } = {}) {
  // A non-positive step would never reach the bottom and hang forever.
  if (!Number.isFinite(distance) || distance <= 0) {
    throw new RangeError(`distance must be a positive number, got ${distance}`);
  }
  if (!Number.isFinite(delay) || delay < 0) {
    throw new RangeError(`delay must be a non-negative number, got ${delay}`);
  }

  // The callback runs in the browser, so the settings are passed as
  // arguments instead of being captured from this scope.
  await page.evaluate(
    (step, intervalMs) =>
      new Promise((resolve) => {
        let totalHeight = 0;
        const timer = setInterval(() => {
          window.scrollBy(0, step);
          totalHeight += step;
          if (totalHeight >= document.body.scrollHeight) {
            clearInterval(timer);
            resolve();
          }
        }, intervalMs);
      }),
    distance,
    delay,
  );
}
|
|
||
|
|
||
/**
 * Renders each URL to its own PDF and returns the results in input order.
 *
 * For every URL the page is loaded, cleaned up (`cleanupDom`), scrolled to the
 * bottom (`autoScroll`), and printed once all images have settled. A URL that
 * fails at any step is logged and skipped; the remaining URLs are still
 * processed, so the result may be shorter than `urls`.
 *
 * @param {object} page - Puppeteer page reused for every URL.
 * @param {string[]} urls - Pages to render.
 * @returns {Promise<Array<{buffer: *, title: string, url: string}>>}
 *   One entry per successfully rendered page: the PDF bytes returned by
 *   `page.pdf`, the page title used for the table of contents, and the URL.
 */
async function generatePdfBuffers(page, urls) {
  const pdfBuffers = [];
  page.setDefaultNavigationTimeout(120_000);
  page.setDefaultTimeout(120_000);

  for (const url of urls) {
    try {
      console.log(`Generating PDF for ${url}`);
      await page.goto(url, { waitUntil: 'networkidle2' });

      await cleanupDom(page);
      await autoScroll(page);

      // Title for the TOC: first <h1>, falling back to the document title.
      // Trimmed so surrounding whitespace/newlines do not end up in the TOC.
      const pageTitle = await page.evaluate(() => {
        return document.querySelector('h1')?.textContent?.trim() || document.title;
      });

      // Wait until every image has either loaded or failed. A failed image
      // must not reject here: that would throw out of this iteration and
      // drop the whole page from the PDF because of a single broken image.
      await page.evaluate(async () => {
        const pending = Array.from(document.images)
          .filter((img) => !img.complete)
          .map(
            (img) =>
              new Promise((resolve) => {
                img.addEventListener('load', resolve, { once: true });
                img.addEventListener('error', resolve, { once: true });
              }),
          );
        await Promise.all(pending);
      });

      const pdfBuffer = await page.pdf({
        format: 'A4',
        printBackground: true,
        preferCSSPageSize: true,
        margin: {
          top: '25mm',
          bottom: '20mm',
          left: '15mm',
          right: '15mm',
        },
        displayHeaderFooter: true,
        headerTemplate: `
          <style>
            .header {
              font-size: 12px;
              color: #333;
              width: 100%;
              text-align: center;
              padding-bottom: 8px;
            }
          </style>
          <div class="header">H2O Document AI Documentation</div>
        `,
        // This footer is intentionally empty; per the original author the
        // output breaks if the template is removed. Page numbers are drawn
        // later, when the per-page PDFs are merged.
        footerTemplate: `
          <div class="footer">
          </div>
        `,
      });

      pdfBuffers.push({
        buffer: pdfBuffer,
        title: pageTitle,
        url,
      });
    } catch (error) {
      // Best effort: report the failure and continue with the next URL.
      console.error(`Error generating PDF for ${url}: ${error.message}`);
    }
  }

  return pdfBuffers;
}
|
|
||
// Table-of-contents layout, in PDF points. Shared by createToc (which draws
// the entries) and mergePdfsWithToc (which places a link over each entry), so
// the two always agree on where an entry lives.
const TOC_PAGE_SIZE = [595, 842]; // A4
const TOC_LEFT_X = 50;
const TOC_PAGE_NUMBER_X = 500;
const TOC_HEADING_Y = 750;
const TOC_FIRST_ENTRY_Y = 700;
const TOC_LINE_HEIGHT = 20;
// Entries stop above this line so they never collide with the page footer.
const TOC_BOTTOM_MARGIN = 50;
const TOC_ENTRIES_PER_PAGE =
  Math.floor((TOC_FIRST_ENTRY_Y - TOC_BOTTOM_MARGIN) / TOC_LINE_HEIGHT) + 1;

/**
 * Maps the position of an entry in the TOC list to the TOC page it is drawn
 * on and its baseline y coordinate on that page.
 *
 * @param {number} index - Zero-based entry index.
 * @returns {{pageIndex: number, y: number}}
 */
function locateTocEntry(index) {
  return {
    pageIndex: Math.floor(index / TOC_ENTRIES_PER_PAGE),
    y: TOC_FIRST_ENTRY_Y - (index % TOC_ENTRIES_PER_PAGE) * TOC_LINE_HEIGHT,
  };
}

/**
 * Builds a standalone table-of-contents PDF listing every entry's title and
 * the page number it will start on in the merged document.
 *
 * The TOC spills onto additional pages when there are more entries than fit
 * on one page, and the listed page numbers account for however many TOC pages
 * precede the content.
 *
 * @param {Array<{buffer: *, title: string}>} pdfEntries - Rendered sections, in order.
 * @returns {Promise<*>} The serialized TOC document, as returned by `PDFDocument#save`.
 */
async function createToc(pdfEntries) {
  const tocDoc = await PDFDocument.create();

  // Always at least one page, so an empty document still gets a heading.
  const tocPageCount = Math.max(1, Math.ceil(pdfEntries.length / TOC_ENTRIES_PER_PAGE));
  const tocPages = Array.from({ length: tocPageCount }, () => tocDoc.addPage([...TOC_PAGE_SIZE]));

  tocPages[0].drawText('Table of Contents', {
    x: TOC_LEFT_X,
    y: TOC_HEADING_Y,
    size: 18,
    color: rgb(0, 0, 0),
  });

  // Content starts right after the TOC pages (page numbers are 1-based).
  let currentPage = tocPageCount + 1;

  for (const [index, entry] of pdfEntries.entries()) {
    const { pageIndex, y } = locateTocEntry(index);
    const tocPage = tocPages[pageIndex];
    const loadedPdf = await PDFDocument.load(entry.buffer);

    // Titles are drawn in link blue; mergePdfsWithToc makes them clickable.
    tocPage.drawText(entry.title, {
      x: TOC_LEFT_X,
      y,
      size: 12,
      color: rgb(0, 0, 1),
    });
    tocPage.drawText(String(currentPage), {
      x: TOC_PAGE_NUMBER_X,
      y,
      size: 12,
    });

    currentPage += loadedPdf.getPageCount();
  }

  return tocDoc.save();
}

/**
 * Produces the final document: TOC pages first, then every section's pages in
 * order, with each TOC title linked to the first page of its section and a
 * "Page X of Y" footer on every page.
 *
 * @param {Array<{buffer: *, title: string}>} pdfEntries - Rendered sections, in order.
 * @returns {Promise<*>} The serialized merged document, as returned by `PDFDocument#save`.
 */
async function mergePdfsWithToc(pdfEntries) {
  const mergedPdf = await PDFDocument.create();

  // The TOC is built as its own document and copied in first so that it
  // occupies the leading pages of the merged file.
  const tocDoc = await PDFDocument.load(await createToc(pdfEntries));
  const tocPages = await mergedPdf.copyPages(tocDoc, tocDoc.getPageIndices());
  tocPages.forEach((page) => mergedPdf.addPage(page));

  // First page of each section, index-aligned with pdfEntries. Tracking by
  // position instead of by title keeps sections with identical titles
  // (e.g. several "Overview" pages) linked to their own first page.
  const sectionFirstPages = [];
  for (const entry of pdfEntries) {
    const pdf = await PDFDocument.load(entry.buffer);
    const pages = await mergedPdf.copyPages(pdf, pdf.getPageIndices());
    pages.forEach((page) => mergedPdf.addPage(page));
    sectionFirstPages.push(pages[0]);
  }

  sectionFirstPages.forEach((targetPage, index) => {
    // A section PDF with no pages has nothing to link to.
    if (!targetPage) return;

    const { pageIndex, y } = locateTocEntry(index);
    const tocPage = mergedPdf.getPage(pageIndex);

    // Link annotation covering the title; activating it jumps to the
    // section's first page, fitted to the window.
    const annotation = mergedPdf.context.obj({
      Type: 'Annot',
      Subtype: 'Link',
      Rect: [TOC_LEFT_X, y, 300, y + 15],
      Border: [0, 0, 0],
      A: {
        Type: 'Action',
        S: 'GoTo',
        D: [targetPage.ref, 'Fit'],
      },
    });

    const annots = tocPage.node.lookup(PDFName.of('Annots')) || mergedPdf.context.obj([]);
    annots.push(annotation);
    tocPage.node.set(PDFName.of('Annots'), annots);
  });

  // Footer with running page numbers, centred at the bottom of every page.
  const totalPages = mergedPdf.getPageCount();
  const font = await mergedPdf.embedFont(StandardFonts.Helvetica);
  const fontSize = 10;
  const marginBottom = 20;

  for (let i = 0; i < totalPages; i++) {
    const page = mergedPdf.getPage(i);
    const { width } = page.getSize();
    const text = `Page ${i + 1} of ${totalPages}`;

    page.drawText(text, {
      x: width / 2 - font.widthOfTextAtSize(text, fontSize) / 2,
      y: marginBottom,
      size: fontSize,
      font,
      color: rgb(0.5, 0.5, 0.5),
    });
  }

  return mergedPdf.save();
}
|
|
||
|
|
||
/**
 * Script entry point: crawls the documentation, renders every page to PDF,
 * merges the results behind a table of contents and writes the file to
 * OUTPUT_FILENAME.
 *
 * On failure the error is logged and the process exit code is set to 1. The
 * browser is closed on every path, including failures.
 *
 * @returns {Promise<void>}
 */
async function main() {
  let browser;

  try {
    // Launch and page creation live inside the try so that a failure in
    // either still reaches the cleanup in `finally`.
    browser = await puppeteer.launch({
      headless: 'new',
      args: ['--font-render-hinting=none'],
    });
    const page = await browser.newPage();

    // Lower this while testing; Infinity crawls every page.
    const MAX_PAGES = Infinity;
    const urls = await getAllPageUrls(page, MAX_PAGES);
    console.log(`Found ${urls.length} pages.`);

    const pdfEntries = await generatePdfBuffers(page, urls);
    const finalPdf = await mergePdfsWithToc(pdfEntries);

    fs.writeFileSync(OUTPUT_FILENAME, finalPdf);
    console.log(`Success. PDF saved as ${OUTPUT_FILENAME}`);
  } catch (err) {
    console.error('Failed. Error generating PDF:', err);
    // Set the exit code instead of calling process.exit(): exiting here
    // would terminate the process before `finally` can close the browser.
    process.exitCode = 1;
  } finally {
    // `browser` is undefined when the launch itself failed.
    await browser?.close();
  }
}
|
|
||
// Run the script. The promise returned by main() is handled explicitly so
// that a rejection is reported and reflected in the exit code instead of
// being left as a floating promise.
main().catch((err) => {
  console.error('Failed. Error generating PDF:', err);
  process.exitCode = 1;
});
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Check failure
Code scanning / CodeQL
Incomplete URL substring sanitization High
Copilot Autofix
AI 6 months ago
To address the issue, we need to validate that `nextPageUrl` belongs to the trusted domain (`docs.h2o.ai`) before adding it to the `urls` array. This can be achieved by parsing the URL using the `URL` constructor and checking its `hostname` property against the trusted domain. This approach ensures that only URLs with the correct host are processed, mitigating the risk of malicious URLs bypassing the check.

The changes will involve:
- Parsing `nextPageUrl` using the `URL` constructor.
- Checking that the `hostname` of `nextPageUrl` matches the trusted domain (`docs.h2o.ai`).