-
-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Description
First of all, great work with this project! It's an impressive feat so far, despite the performance difference between this and native Tesseract.
Describe the bug
When using a Scheduler with multiple Workers, adding a getPDF job returns the same PDF document repeatedly, even though the Scheduler processes multiple different files.
To Reproduce
Steps to reproduce the behavior:
const {createWorker, createScheduler} = require('tesseract.js');
const fs = require('fs');
// Single scheduler shared by initialize(), recognize() and the final terminate() call.
const scheduler = createScheduler();
// Number of workers that initialize() creates and adds to the scheduler.
const nWorkers = 4;
/**
 * Creates `nWorkers` workers one at a time, loads and initializes the
 * 'dan-fast' language on each, and registers each with the shared scheduler.
 * Logs 'OCR initialized' once every worker has been added.
 *
 * @returns {Promise<void>} Resolves after the last worker has been added.
 */
async function initialize() {
  const language = 'dan-fast';
  let created = 0;
  while (created < nWorkers) {
    // A fresh options object (and logger callback) is built for every worker.
    const options = {
      cachePath: 'langs',
      langPath: 'langs',
      logger: (message) => console.log(message),
    };
    const ocrWorker = createWorker(options);
    // Each setup step is awaited before the next one starts.
    await ocrWorker.load();
    await ocrWorker.loadLanguage(language);
    await ocrWorker.initialize(language);
    scheduler.addWorker(ocrWorker);
    created += 1;
  }
  console.log('OCR initialized');
}
/**
 * Queues a 'recognize' job for one image on the shared scheduler, then queues
 * a separate 'getPDF' job and writes the returned data to a file under images/.
 *
 * @param {string} imagePath - '/'-separated path of the image to process.
 * @returns {Promise<string>} Resolves to `result.data.text` from the 'recognize' job.
 */
async function recognize(imagePath) {
// Last '/'-separated segment of the path, i.e. the file name.
const filePath = imagePath.split('/').slice(-1).pop();
const result = await scheduler.addJob('recognize', imagePath);
// NOTE(review): 'getPDF' is queued as a second, independent scheduler job;
// presumably nothing ties it to the worker that ran 'recognize' above —
// confirm against the scheduler's dispatch logic.
const {data} = await scheduler.addJob('getPDF', filePath);
// Output name is 'images/ocr-' + file name, with the first '.png' replaced by '.pdf'.
fs.writeFileSync(`images/ocr-${filePath.replace('.png', '.pdf')}`, Buffer.from(data));
return result.data.text
}
// Driver: initialize the workers, run recognize() on three pages concurrently,
// print each page's text, then shut the scheduler down.
initialize()
  .then(() => {
    // Start all three jobs before waiting on any of them.
    const pending = [];
    for (let i = 0; i < 3; i++) {
      pending.push(recognize(`images/page-${i+1}.png`));
    }
    return Promise.all(pending);
  })
  .then((results) => {
    results.forEach((res) => {
      console.log('-----------');
      console.log(res);
    });
  })
  // Terminate the scheduler whether the preceding steps fulfilled or rejected.
  .finally(() => scheduler.terminate())
  // Report any failure (including one from terminate()) instead of leaving an
  // unhandled rejection, and signal it through the process exit code.
  .catch((err) => {
    console.error(err);
    process.exitCode = 1;
  });
Expected behavior
getPDF() should be able to produce the PDF file (or its byte representation) associated with a given recognition job, or the PDF byte representation should be part of the result from the recognition job.
The current operating model disables OCR-PDF rendering when using a scheduler, making processing a document with several pages more time-consuming.
Versions:
- OS: Mac OS 10.13.6
- Node: v12.18.1
- Tesseract.js: 2.1.3
