add PDF text extraction (for pages)

this will return data in the same format that djvu.c already produces.
hopefully, this will permit us to re-use the highlighting code
(and factor it out into unireader.lua)
pull/2/merge
HW 12 years ago
parent da6c2309d2
commit 80a6e0210b

120
pdf.c

@ -311,6 +311,123 @@ static int openPage(lua_State *L) {
return 1;
}
/* get the text of the given page
*
* will return text in a Lua table that is modeled after
* djvu.c creates this table.
*
* note that the definition of "line" is somewhat arbitrary
* here (for now)
*
* MuPDFs API provides text as single char information
* that is collected in "spans". we use a span as a "line"
* in Lua output and segment spans into words by looking
* for space characters.
*
* will return an empty table if we have no text
*/
static int getPageText(lua_State *L) {
fz_text_span *page_text;
fz_text_span *ptr;
fz_device *tdev;
fz_bbox bbox, linebbox;
fz_matrix ctm;
int i;
int word, line;
int len, c;
int start;
char chars[4]; // max length of UTF-8 encoded rune
luaL_Buffer textbuf;
PdfPage *page = (PdfPage*) luaL_checkudata(L, 1, "pdfpage");
/* returned coordinates are in centi-point (n * 0.01 pt) */
ctm = fz_scale(100, 100);
page_text = fz_new_text_span(page->doc->context);
tdev = fz_new_text_device(page->doc->context, page_text);
fz_run_page(page->doc->xref, page->page, tdev, ctm, NULL);
fz_free_device(tdev);
/* table that contains all the lines */
lua_newtable(L);
ptr = page_text;
line = 1;
while(ptr) {
/* table for the words */
lua_newtable(L);
word = 1;
linebbox = ptr->text[0].bbox; // start with sensible default
for(i = 0; i < ptr->len; ) {
/* will hold information about a word: */
lua_newtable(L);
luaL_buffinit(L, &textbuf);
bbox = ptr->text[i].bbox; // start with sensible default
for(; i < ptr->len; i++) {
/* check for space characters */
if(ptr->text[i].c == ' ' ||
ptr->text[i].c == '\t' ||
ptr->text[i].c == '\n' ||
ptr->text[i].c == '\v' ||
ptr->text[i].c == '\f' ||
ptr->text[i].c == '\r' ) {
// ignore and end word
i++;
break;
}
len = runetochar(chars, &ptr->text[i].c);
for(c = 0; c < len; c++) {
luaL_addchar(&textbuf, chars[c]);
}
bbox = fz_union_bbox(bbox, ptr->text[i].bbox);
linebbox = fz_union_bbox(linebbox, ptr->text[i].bbox);
}
lua_pushstring(L, "word");
luaL_pushresult(&textbuf);
lua_settable(L, -3);
/* bbox for a word: */
lua_pushstring(L, "x0");
lua_pushinteger(L, bbox.x0);
lua_settable(L, -3);
lua_pushstring(L, "y0");
lua_pushinteger(L, bbox.y0);
lua_settable(L, -3);
lua_pushstring(L, "x1");
lua_pushinteger(L, bbox.x1);
lua_settable(L, -3);
lua_pushstring(L, "y1");
lua_pushinteger(L, bbox.y1);
lua_settable(L, -3);
lua_rawseti(L, -2, word++);
}
/* bbox for a whole line (or in fact, a "span") */
lua_pushstring(L, "x0");
lua_pushinteger(L, linebbox.x0);
lua_settable(L, -3);
lua_pushstring(L, "y0");
lua_pushinteger(L, linebbox.y0);
lua_settable(L, -3);
lua_pushstring(L, "x1");
lua_pushinteger(L, linebbox.x1);
lua_settable(L, -3);
lua_pushstring(L, "y1");
lua_pushinteger(L, linebbox.y1);
lua_settable(L, -3);
lua_rawseti(L, -2, line++);
ptr = ptr->next;
}
fz_free_text_span(page->doc->context, page_text);
return 1;
}
static int getPageSize(lua_State *L) {
fz_matrix ctm;
fz_rect bounds;
@ -323,7 +440,7 @@ static int getPageSize(lua_State *L) {
ctm = fz_concat(ctm, fz_rotate(dc->rotate));
bbox = fz_transform_rect(ctm, bounds);
lua_pushnumber(L, bbox.x1-bbox.x0);
lua_pushnumber(L, bbox.x1-bbox.x0);
lua_pushnumber(L, bbox.y1-bbox.y0);
return 2;
@ -456,6 +573,7 @@ static const struct luaL_Reg pdfdocument_meth[] = {
static const struct luaL_Reg pdfpage_meth[] = {
{"getSize", getPageSize},
{"getUsedBBox", getUsedBBox},
{"getPageText", getPageText},
{"close", closePage},
{"__gc", closePage},
{"draw", drawPage},

Loading…
Cancel
Save