-
-
Notifications
You must be signed in to change notification settings - Fork 86
/
PdfReader.js
83 lines (76 loc) · 2.78 KB
/
PdfReader.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/**
* PdfReader: class that reads a PDF file, and calls a function on each item found while parsing that file.
* @author Adrien Joly, http://github.com/adrienjoly
* This content is released under the MIT License.
*
* An item object can match one of the following objects:
* - null or undefined, when the parsing is over, or an error occurred.
* - {file:{path:string}} or {file:{buffer}}, when a PDF file (or buffer) is being opened.
* - {page:integer, width, height}, when a new page is being parsed, provides the page number, starting at 1.
* - {text:string, x:float, y:float, w:float, h:float...}, represents each text with its position.
*
**/
import { log as LOG } from "./lib/LOG.js";
import PDFParser from "pdf2json"; // doc: https://github.com/modesty/pdf2json
/**
 * decodeItemText: decodes the URI-encoded text of a text run.
 * Falls back to the raw string when it is not a valid URI-encoded sequence
 * (e.g. it contains a bare "%"), so that a single malformed text does not
 * abort the parsing of the whole document with a URIError.
 * @param {string} encodedText - value of the `T` property of a text run
 * @returns {string} the decoded text, or `encodedText` as is if it cannot be decoded
 */
function decodeItemText(encodedText) {
  try {
    return decodeURIComponent(encodedText);
  } catch (err) {
    if (err instanceof URIError) {
      return encodedText;
    }
    throw err;
  }
}

/**
 * forEachItem: walks through the parsed pdf data and calls handler(error, item):
 * - once per page, with {page, width, height} (page numbers start at 1),
 * - once per text of that page, with the text object itself, on which a
 *   decoded `text` property is set (from its first run: `R[0].T`),
 * - and finally once without any argument, when all pages were processed.
 * @param {object} pdf - parsed data, exposing `Pages` or (legacy) `formImage.Pages`
 * @param {function} handler - called as handler(error, item); error is always null here
 */
function forEachItem(pdf, handler) {
  // pdf.formImage was removed in pdf2json@2, but we keep backward compatibility too
  const pages = pdf.Pages || pdf.formImage.Pages;
  let pageNumber = 0;
  // Object.values() only visits own entries (in index order for arrays), unlike
  // for...in which also enumerates properties inherited from Array.prototype.
  for (const page of Object.values(pages || [])) {
    pageNumber += 1;
    handler(null, {
      page: pageNumber,
      width: page.Width || (pdf.formImage ? pdf.formImage.Width : 0),
      height:
        page.Height ||
        (pdf.formImage ? pdf.formImage.Pages[pageNumber - 1].Height : 0),
    });
    // a page without Texts simply yields no text item
    for (const item of Object.values(page.Texts || [])) {
      item.text = decodeItemText(item.R[0].T);
      handler(null, item);
    }
  }
  // calling the handler without item signals the end of the parsing
  handler();
}
/**
 * PdfReader: constructor, to be called with `new`.
 * Keeps the given options on the instance as `this.options`
 * (an empty object when none, or a falsy value, is provided).
 * The methods of this file read `options.password` and `options.debug`.
 * @param {object} [options]
 */
export function PdfReader(options) {
  // only displayed if LOG.js was first loaded with `true` as init parameter
  LOG("PdfReader");
  this.options = options ? options : {};
}
/**
 * parseFileItems: calls itemHandler(error, item) on each item parsed from the pdf file
 * - first, synchronously, with {file:{path}} to announce the file being opened,
 * - then with whatever forEachItem() emits once the parser reports its data as ready,
 * - or with the parser's error as first argument, if it reports a data error.
 * Reads `password` and `debug` from this.options.
 **/
PdfReader.prototype.parseFileItems = function (pdfFilePath, itemHandler) {
  itemHandler(null, { file: { path: pdfFilePath } });

  const settings = this.options;
  // the password is only passed to the parser when one was provided
  const pdfParser = settings.password
    ? new PDFParser(null, null, settings.password)
    : new PDFParser();

  pdfParser.on("pdfParser_dataError", itemHandler);
  pdfParser.on("pdfParser_dataReady", (pdfData) => {
    forEachItem(pdfData, itemHandler);
  });

  // debug mode => verbosity 1, otherwise 0
  pdfParser.loadPDF(pdfFilePath, settings.debug ? 1 : 0);
};
/**
 * parseBuffer: calls itemHandler(error, item) on each item parsed from the pdf file received as a buffer
 * - first, synchronously, with {file:{buffer}} to announce the buffer being opened,
 * - then with whatever forEachItem() emits once the parser reports its data as ready,
 * - or with the parser's error as first argument, if it reports a data error.
 * Reads `password` and `debug` from this.options.
 */
PdfReader.prototype.parseBuffer = function (pdfBuffer, itemHandler) {
  itemHandler(null, { file: { buffer: pdfBuffer } });

  const settings = this.options;
  // the password is only passed to the parser when one was provided
  const pdfParser = settings.password
    ? new PDFParser(null, null, settings.password)
    : new PDFParser();

  pdfParser.on("pdfParser_dataError", itemHandler);
  pdfParser.on("pdfParser_dataReady", (pdfData) => {
    forEachItem(pdfData, itemHandler);
  });

  // debug mode => verbosity 1, otherwise 0
  pdfParser.parseBuffer(pdfBuffer, settings.debug ? 1 : 0);
};