changes to xpdf

Hi all,


I want pdftohtml (http://www.foolabs.com/xpdf/download.html) to output a placeholder/marker for Path/Picture commands. This is useful for "OCR" on documents where lines delimit paragraphs or publications, as it is the case for me.

Solution:

In the beginning of Gfx::Go I define 4 Objekts (BT=Begin Text, Tf=Set Textfont, Tj=Write Text, ET=End Text):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
  char BTchar[2];
  BTchar[0] = 'B';
  BTchar[1] = 'T';
  BTchar[2] = '\0';
  char *BTptr = &BTchar[0];
  Object BTobj;
  BTobj.initCmd(BTptr);
  Object BTargs[0];
  printf("BTobj ");
  BTobj.print(stdout);
  printf("-----------------------------------\n");
  execOp(&BTobj, BTargs, 0);

  char Tfchar[2];
  Tfchar[0] = 'T';
  Tfchar[1] = 'f';
  Tfchar[2] = '\0';
  char *Tfptr = &Tfchar[0];
  Object Tfobj;
  Tfobj.initCmd(Tfptr);
  Object Tfargs[1];
  Tfargs[0].initName((char*) 'F0\0');
  Tfargs[1].initInt(12);
  printf("Tfobj ");
  Tfobj.print(stdout);
  printf("-----------------------------------\n");
  execOp(&Tfobj, Tfargs, 2);

  char Tjchar[2];
  Tjchar[0] = 'T';
  Tjchar[1] = 'j';
  Tjchar[2] = '\0';
  char *Tjptr = &Tjchar[0];
  Object Tjobj;
  Tjobj.initCmd(Tjptr);
//  Tjobj.initCmd((char) 'Tj\0');
  Object Tjargs[0];
  Tjargs[0].initString(new GString((char*) 'ABC'));
  printf("Tjobj ");
  Tjobj.print(stdout);
  printf("-----------------------------------\n");
  execOp(&Tjobj, Tjargs, 1);

  char ETchar[2];
  ETchar[0] = 'E';
  ETchar[1] = 'T';
  ETchar[2] = '\0';
  char *ETptr = &ETchar[0];
  Object ETobj;
  ETobj.initCmd(ETptr);
  Object ETargs[0];
  printf("ETobj ");
  ETobj.print(stdout);
  printf("-----------------------------------\n");
  execOp(&ETobj, BTargs, 0);


Before execOp(&obj, args, numArgs); I want to test for the "l" opcode and output my 4 objects:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
if (strcmp("l", obj.getCmd()) == 0) {

  printf("BTobj ");
  BTobj.print(stdout);
  printf("-----------------------------------\n");
  execOp(&BTobj, BTargs, 0);

  printf("Tfobj ");
  Tfobj.print(stdout);
  printf("-----------------------------------\n");
  execOp(&Tfobj, Tfargs, 2);

  printf("Tjobj ");
  Tjobj.print(stdout);
  printf("-----------------------------------\n");
  execOp(&Tjobj, Tjargs, 1);

  printf("ETobj ");
  ETobj.print(stdout);
  printf("-----------------------------------\n");
  execOp(&ETobj, BTargs, 0);

}


The application does compile, the output is as follows:

l 0 716.79
BTobj BT-----------------------------------
Tfobj 12-----------------------------------
Error (484516): Unknown operator ''
Tjobj Tj-----------------------------------
ETobj ET-----------------------------------

"l" commands are properly tested, but my Tf object produces an error ....

Is there maybe a Unicode issue?

The kind of document I want to OCR can be downloaded under http://www.etat.lu/memorial/memorial/2009/C/Pdf/c0002021.pdf

I spend already a week on this, but I really do not know any further! Thanks a lot for your time!
Topic archived. No new replies allowed.