Hi all,
I want pdftohtml (
http://www.foolabs.com/xpdf/download.html) to output a placeholder/marker for Path/Picture commands. This is useful for "OCR" on documents where lines delimit paragraphs or publications, as it is the case for me.
Solution:
In the beginning of Gfx::Go I define 4 Objekts (BT=Begin Text, Tf=Set Textfont, Tj=Write Text, ET=End Text):
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
|
char BTchar[2];
BTchar[0] = 'B';
BTchar[1] = 'T';
BTchar[2] = '\0';
char *BTptr = &BTchar[0];
Object BTobj;
BTobj.initCmd(BTptr);
Object BTargs[0];
printf("BTobj ");
BTobj.print(stdout);
printf("-----------------------------------\n");
execOp(&BTobj, BTargs, 0);
char Tfchar[2];
Tfchar[0] = 'T';
Tfchar[1] = 'f';
Tfchar[2] = '\0';
char *Tfptr = &Tfchar[0];
Object Tfobj;
Tfobj.initCmd(Tfptr);
Object Tfargs[1];
Tfargs[0].initName((char*) 'F0\0');
Tfargs[1].initInt(12);
printf("Tfobj ");
Tfobj.print(stdout);
printf("-----------------------------------\n");
execOp(&Tfobj, Tfargs, 2);
char Tjchar[2];
Tjchar[0] = 'T';
Tjchar[1] = 'j';
Tjchar[2] = '\0';
char *Tjptr = &Tjchar[0];
Object Tjobj;
Tjobj.initCmd(Tjptr);
// Tjobj.initCmd((char) 'Tj\0');
Object Tjargs[0];
Tjargs[0].initString(new GString((char*) 'ABC'));
printf("Tjobj ");
Tjobj.print(stdout);
printf("-----------------------------------\n");
execOp(&Tjobj, Tjargs, 1);
char ETchar[2];
ETchar[0] = 'E';
ETchar[1] = 'T';
ETchar[2] = '\0';
char *ETptr = &ETchar[0];
Object ETobj;
ETobj.initCmd(ETptr);
Object ETargs[0];
printf("ETobj ");
ETobj.print(stdout);
printf("-----------------------------------\n");
execOp(&ETobj, BTargs, 0);
|
Before
execOp(&obj, args, numArgs);
I want to test for the "l" opcode and output my 4 objects:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
|
if (strcmp("l", obj.getCmd()) == 0) {
printf("BTobj ");
BTobj.print(stdout);
printf("-----------------------------------\n");
execOp(&BTobj, BTargs, 0);
printf("Tfobj ");
Tfobj.print(stdout);
printf("-----------------------------------\n");
execOp(&Tfobj, Tfargs, 2);
printf("Tjobj ");
Tjobj.print(stdout);
printf("-----------------------------------\n");
execOp(&Tjobj, Tjargs, 1);
printf("ETobj ");
ETobj.print(stdout);
printf("-----------------------------------\n");
execOp(&ETobj, BTargs, 0);
}
|
The application does compile, the output is as follows:
l 0 716.79
BTobj BT-----------------------------------
Tfobj 12-----------------------------------
Error (484516): Unknown operator ''
Tjobj Tj-----------------------------------
ETobj ET-----------------------------------
"l" commands are properly tested, but my Tf object produces an error ....
Is there maybe a Unicode issue?
The kind of document I want to OCR can be downloaded under
http://www.etat.lu/memorial/memorial/2009/C/Pdf/c0002021.pdf
I spend already a week on this, but I really do not know any further! Thanks a lot for your time!