|
|
PDF OCR SDK Reference Manual
Download Evaluation Version
Purchase Full Version
typedef
struct
STEXTPOStag{
int x; //left position for
the character or word
int y; //top position for
the character or word
int width; //width for the
character or word
int height; //height of
the character or word
char text[500]; //text contents
}STEXTPOS;
int
WINAPI Image2PDFOCR_SinglePage_GetTextInfo(char
*lpszPDFFile, LPBYTE *lpChars,
char *lpszOptions)
Description
This function performs OCR on a PDF file or image file and
returns the text information to the calling application. It
supports single-page PDF files only.
Parameters
lpszPDFFile
[in] Input PDF filename.
lpChars
[out] OCRed text contents, it is a pointer to the
STEXTPOS structure.
lpszOptions
[in] Options for the OCR process. This parameter
supports the following options:
-pidpi: Set the DPI resolution for render PDF page and
OCR.
-firstpg: First page to be OCRed.
-lastpg: Last page to be OCRed.
Return Values
If the function succeeds, the return value is the number of
STEXTPOS structures returned through the lpChars parameter.
Example
LPBYTE lpChars = NULL;
int
nCharCount = Image2PDFOCR_SinglePage_GetTextInfo(szInFile, &lpChars,
"-pidpi 300 -firstpg 1 -lastpg
1");
STEXTPOS *lpTextPos = (STEXTPOS*)lpChars;
for(int
i = 0; i < nCharCount; i++)
{
printf("%d,%d,%d,%d,%s\n",
lpTextPos[i].x, lpTextPos[i].y, lpTextPos[i].width,
lpTextPos[i].height, lpTextPos[i].text);
}
void WINAPI
Image2PDFOCR_SinglePage_FreeTextInfo(LPBYTE lpChars)
Description
Deallocates or frees a memory block.
Parameters
lpChars
[in] OCRed text contents, it is a pointer to the
STEXTPOS structure, it is returned by
Image2PDFOCR_SinglePage_GetTextInfo function.
Return Values
None.
Example
Image2PDFOCR_SinglePage_FreeTextInfo(lpChars);
int WINAPI
Image2PDFOCR_SinglePage_CreatePDF(char
*lpszInPDFFile, LPBYTE lpChars,
int nCharCount,
char *lpszOutPDFFile,
char *lpszOptions)
Description
Create searchable PDF file.
Parameters
lpszInPDFFile
[in] Input PDF filename.
lpChars
[in] OCRed text contents, it is a pointer to the
STEXTPOS structure.
nCharCount
[in] The number of STEXTPOS structure.
lpszOutPDFFile
[in] Filename of the output searchable PDF file to create.
lpszOptions
[in] Options for PDF creation. This parameter
supports the following options:
-pidpi: Set the DPI resolution for rendering the PDF page and
OCR; the value of this option should be the same as the -pidpi option
passed to the Image2PDFOCR_SinglePage_GetTextInfo function.
-firstpg: First page to merge the text information.
-lastpg: Last page to merge the text information.
Return Values
If the function succeeds, the return value is zero. If the
function fails, the return value is one of following values,
-1: Can't find input PDF file.
-2: Can't load DLL files correctly.
-3: Something is wrong in lpChars structure.
Example
char
szInFile[MAX_PATH] = {0};
char
szOutFile[MAX_PATH] = {0};
GetModulePath(szInFile,"test2.pdf");
GetModulePath(szOutFile,"test2_pdf_ocred-singlepage.pdf");
LPBYTE lpChars = NULL;
int
nCharCount = Image2PDFOCR_SinglePage_GetTextInfo(szInFile, &lpChars,
"-pidpi 300 -firstpg 1 -lastpg
1");
STEXTPOS *lpTextPos = (STEXTPOS*)lpChars;
for(int
i = 0; i < nCharCount; i++)
{
printf("%d,%d,%d,%d,%s\n",
lpTextPos[i].x, lpTextPos[i].y, lpTextPos[i].width,
lpTextPos[i].height, lpTextPos[i].text);
}
nRet = Image2PDFOCR_SinglePage_CreatePDF(szInFile, lpChars,
nCharCount, szOutFile,
"-pidpi
300 -firstpg 1 -lastpg 1");
Image2PDFOCR_SinglePage_FreeTextInfo(lpChars);
printf("Example #2 return
'%d'\n", nRet);
int
WINAPI Image2PDFOCR_PDFCmd(char
*lpszCmd)
Description
Process PDF file by some special commands.
Parameters
lpszCmd
[in] Command line used to process the PDF file. The
following commands are supported:
-mergepdf: merge more PDF files into one PDF file.
Return Values
If the function succeeds, the return value is zero. If the
function fails, the return value is one of following values,
-1: Input wrong command line.
-2: Can't load DLL files correctly.
Example
char
szMergePDFCmd[1024] = {0};
sprintf(szMergePDFCmd,
"-mergepdf C:\\allfiles.txt C:\\out.pdf");
int nRet = Image2PDFOCR_PDFCmd(szMergePDFCmd);
int
WINAPI Image2PDFOCR_CreateSearchablePDF(const
char
*lpTIFOrPDFFile,
const
char
*lpOutputFile, const
char
*lpOptions)
Description
Convert TIFF or PDF file to searchable PDF file directly.
Parameters
lpTIFOrPDFFile
[in] Input TIFF or PDF filename.
lpOutputFile
[in] Output PDF filename.
lpOptions
[in] Options to control the conversion.
Return Values
If the function succeeds, the return value is zero. If the
function fails, the return value is one of following values,
1: Can't find input TIFF or PDF file.
-2: Can't load DLL files correctly.
2: Something is wrong during conversion.
Example
char
szInFile[MAX_PATH] = {0};
char
szOutFile[MAX_PATH] = {0};
GetModulePath(szInFile,"test.tif");
GetModulePath(szOutFile,"test_tif_ocred.pdf");
nRet = Image2PDFOCR_CreateSearchablePDF(szInFile, szOutFile,
"");
printf("Example #1 return
'%d'\n", nRet);
HANDLE WINAPI Image2PDFOCR_GetTextHandle(char
*lpszTIFOrPDFFile,
char
*lpszOptions)
Description
OCR TIFF or PDF file and return a handle.
Parameters
lpszTIFOrPDFFile
[in] Input TIFF or PDF filename.
lpszOptions
[in] Set the options for OCR process, this parameter
does support following options,
-pidpi: Set the DPI resolution for render PDF page and
OCR.
-firstpg: First page to be OCRed.
-lastpg: Last page to be OCRed.
-ocrrect: OCR text in a rectangle. The rectangle is specified in
pixels as [X, Y, Width, Height].
For example, if you wish to OCR the text in the rectangle [74, 47, 200, 65] on the
PDF page and render the PDF page at 300 DPI, you can use the
following method to calculate the rectangle position on the OCRed PDF page:
' "Coordinate on PDF page" * "the value of
-pidpi" / 72 =
"Coordinate on OCRed PDF page"
x = (74 * (300 / 72))
y = (47 * (300 / 72))
w = (200 * (300 / 72))
h = (65 * (300 / 72))
Dim socrrect As String
socrrect = x.ToString & "," & y.ToString & "," &
w.ToString & "," & h.ToString
strOptions = "-pidpi 300 -ocrrect """ & socrrect & """
-firstpg 1 -lastpg 1"
Dim hOCRTextSDK As Integer =
Image2PDFOCR_GetTextHandle(strInFile, strOptions)
Return Values
If the function succeeds, the return value is an open handle
to the OCRed contents. If the function fails, the return value is
NULL.
Example
HANDLE hOCRTextSDK = Image2PDFOCR_GetTextHandle(szInFile,
"-pidpi 300");
int
WINAPI Image2PDFOCR_GetOCRedPageCount(HANDLE hImage2PDFData);
Description
Get the OCRed page count from a handle.
Parameters
hImage2PDFData
[in] This parameter is returned by
Image2PDFOCR_GetTextHandle function.
Return Values
The number of OCRed pages.
int
WINAPI Image2PDFOCR_GetTextInfo(HANDLE hImage2PDFData,
int nPage,
LPBYTE *lpOutTextInfo);
Description
Read text information from a handle.
Parameters
hImage2PDFData
[in] This parameter is returned by
Image2PDFOCR_GetTextHandle function.
nPage
[in] Specify page number to retrieve text information.
lpOutTextInfo
[out] OCRed text contents; receives a pointer to an array of
STEXTPOS structures.
Return Values
If the function succeeds, the return value is the number of
STEXTPOS structures returned in lpOutTextInfo. If the function fails, the return value is
0.
Example
LPBYTE lpChars = NULL;
int
nCharCount = Image2PDFOCR_GetTextInfo(hOCRTextSDK, page, &lpChars);
STEXTPOS *lpTextPos = (STEXTPOS*)lpChars;
for(int
i = 0; i < nCharCount; i++)
{
printf("%d,%d,%d,%d,%s\n",
lpTextPos[i].x, lpTextPos[i].y, lpTextPos[i].width,
lpTextPos[i].height, lpTextPos[i].text);
}
int
WINAPI Image2PDFOCR_SetTextInfo(HANDLE hImage2PDFData,
int nPage,
LPBYTE lpInTextInfo,
int
nCharNum)
Description
Modify OCRed text information.
Parameters
hImage2PDFData
[in] This parameter is returned by
Image2PDFOCR_GetTextHandle function.
nPage
[in] Specify the page number whose text information is to be modified.
lpInTextInfo
[in] Set modified text information, this is a pointer to
the STEXTPOS structure.
nCharNum
[in] The number of STEXTPOS structures in lpInTextInfo.
Return Values
If the function succeeds, the return value is the number of
modified STEXTPOS structure. If the function fails, the return value
is 0.
Example
LPBYTE lpChars = NULL;
int
nCharCount = Image2PDFOCR_GetTextInfo(hOCRTextSDK, page, &lpChars);
STEXTPOS *lpTextPos = (STEXTPOS*)lpChars;
for(int
i = 0; i < nCharCount; i++)
{
printf("%d,%d,%d,%d,%s\n",
lpTextPos[i].x, lpTextPos[i].y, lpTextPos[i].width,
lpTextPos[i].height, lpTextPos[i].text);
}
if(lpTextPos)
{
strcpy(lpTextPos->text,
"Image2PDF");
}
Image2PDFOCR_SetTextInfo(hOCRTextSDK, page, lpChars, nCharCount);
void
WINAPI Image2PDFOCR_FreeTextHandle(HANDLE hImage2PDFData)
Description
Free data handle.
Parameters
hImage2PDFData
[in] This parameter is returned by
Image2PDFOCR_GetTextHandle function.
Return Values
None.
int
WINAPI Image2PDFOCR_CreatePDF(HANDLE hImage2PDFData,
char
*lpszOutPDFFile,
char
*lpszOptions)
Description
Create searchable PDF file.
Parameters
hImage2PDFData
[in] This parameter is returned by
Image2PDFOCR_GetTextHandle function.
lpszOutPDFFile
[in] output PDF filename.
lpszOptions
[in] Set the options for PDF creating, this parameter does
support following options,
-pidpi: Set the DPI resolution for render PDF page and
OCR, the value of this parameter should same as -pidpi option
in Image2PDFOCR_GetTextHandle function.
-firstpg: First page to merge the text information.
-lastpg: Last page to merge the text information.
Return Values
If the function succeeds, the return value is zero. If the
function fails, the return value is one of following values,
-1: hImage2PDFData parameter is NULL.
-2: hImage2PDFData parameter is not a valid handle.
-3: Can't load DLL files correctly.
Example
nRet = Image2PDFOCR_CreatePDF(hOCRTextSDK, szOutFile,
"-pidpi 300");
int
WINAPI Image2PDFOCR_CreatePDFInMemory(HANDLE hImage2PDFData,
char
*lpszOptions, LPBYTE *lpDataBuf,
int
*nDataBufLen)
Description
Create searchable PDF file in memory.
Parameters
hImage2PDFData
[in] This parameter is returned by
Image2PDFOCR_GetTextHandle function.
lpszOptions
[in] Set the options for PDF creating, this parameter does
support following options,
-pidpi: Set the DPI resolution for render PDF page and
OCR, the value of this parameter should same as -pidpi option
in Image2PDFOCR_GetTextHandle function.
-firstpg: First page to merge the text information.
-lastpg: Last page to merge the text information.
lpDataBuf
[out] Set a pointer to receive searchable PDF stream in
memory.
nDataBufLen
[out] Pointer that receives the length, in bytes, of the lpDataBuf
buffer.
Return Values
If the function succeeds, the return value is zero. If the
function fails, the return value is one of following values,
-1: hImage2PDFData parameter is NULL.
-2: hImage2PDFData parameter is not a valid handle.
-3: Can't load DLL files correctly.
Example
char
szInFile[MAX_PATH] = {0};
char
szOutFile[MAX_PATH] = {0};
GetModulePath(szInFile,"test2.pdf");
GetModulePath(szOutFile,"test2_pdf_ocred-in-memory.pdf");
int
time1 = GetTickCount();
char
*lpOptions = "-pidpi 300";
int
nPageCount = Image2PDFOCR_GetPageCount(szInFile);
printf("'%s' file contains
'%d' pages.\n", szInFile, nPageCount);
HANDLE hOCRTextSDK = Image2PDFOCR_GetTextHandle(szInFile,
lpOptions);
if(hOCRTextSDK)
{
int
nPageCount = Image2PDFOCR_GetOCRedPageCount(hOCRTextSDK);
for(int
page = 0; page < nPageCount; page++)
{
LPBYTE lpChars = NULL;
int
nCharCount = Image2PDFOCR_GetTextInfo(hOCRTextSDK, page, &lpChars);
STEXTPOS *lpTextPos = (STEXTPOS*)lpChars;
for(int
i = 0; i < nCharCount; i++)
{
printf("%d,%d,%d,%d,%s\n",
lpTextPos[i].x, lpTextPos[i].y, lpTextPos[i].width,
lpTextPos[i].height, lpTextPos[i].text);
}
if(lpTextPos)
{
strcpy(lpTextPos->text,
"Image2PDF");
}
printf("Update text info for
page %d\n", page+1);
Image2PDFOCR_SetTextInfo(hOCRTextSDK, page, lpChars, nCharCount);
}
LPBYTE lpDataBuf = NULL;
int
nDataBufLen = 0;
nRet
= Image2PDFOCR_CreatePDFInMemory(hOCRTextSDK, lpOptions, &lpDataBuf,
&nDataBufLen);
if(lpDataBuf
!= NULL && nDataBufLen > 0)
{
FILE *file = fopen(szOutFile,
"wb");
if(file)
{
fwrite(lpDataBuf, 1, nDataBufLen, file);
fclose(file);
}
}
Image2PDFOCR_FreeTextHandle(hOCRTextSDK);
}
int
time2 = GetTickCount();
printf("Example return '%d',
it is spend %dms (%.2fs)...\n", nRet,
time2-time1, (time2-time1)/1000.0);
int
WINAPI Image2PDFOCR_GetPageCount(char
*lpszPDFFile)
Description
Read page count from PDF file.
Parameters
lpszPDFFile
[in] input PDF filename.
Return Values
If the function succeeds, the return value is count of PDF
pages. If the function fails, the return value is zero.
int
WINAPI Image2PDFOCR_GetWordCountOnPage(HANDLE hImage2PDFData,
int
nPage)
Description
Get word count from OCRed page contents.
Parameters
hImage2PDFData
[in] This parameter is returned by
Image2PDFOCR_GetTextHandle function.
nPage
[in] Specify page number to get the word count.
Return Values
If the function succeeds, the return value is count of word
contents. If the function fails, the return value is zero.
Example
int
nPageCount = Image2PDFOCR_GetOCRedPageCount(hOCRTextSDK);
for(int
page = 0; page < nPageCount; page++)
{
int
nWordCount = Image2PDFOCR_GetWordCountOnPage(hOCRTextSDK, page);
for(int
nWordIndex = 0; nWordIndex < nWordCount; nWordIndex++)
{
int
X, Y, Width, Height;
char
szText[500] = {0};
Image2PDFOCR_GetWordInfoByIndex(hOCRTextSDK, page, nWordIndex, &X,
&Y, &Width, &Height, szText);
if(nWordIndex
== 0)
strcpy(szText,
"Image2PDF");
Image2PDFOCR_SetWordInfoByIndex(hOCRTextSDK, page, nWordIndex, X, Y,
Width, Height, szText);
}
}
int
WINAPI Image2PDFOCR_GetWordInfoByIndex(HANDLE hImage2PDFData,
int
nPage, int
nWordIndex, int
*X, int
*Y, int
*Width, int
*Height, char
*lpText)
Description
Get Word information by given index.
Parameters
hImage2PDFData
[in] This parameter is returned by
Image2PDFOCR_GetTextHandle function.
nPage
[in] Specify page number to get the word information.
nWordIndex
[in] Specify word index to get the word information.
X, Y, Width, Height
[out] Receive word's X, Y, Width, Height information.
lpText
[out] Receives the word's text information; the lpText
buffer should be at least 500 characters long.
Return Values
If the function succeeds, the return value is 1. If the
function fails, the return value is zero.
int
WINAPI Image2PDFOCR_SetWordInfoByIndex(HANDLE hImage2PDFData,
int
nPage, int
nWordIndex, int
X, int
Y, int
Width, int
Height, char
*lpText)
Description
Set Word information by given index.
Parameters
hImage2PDFData
[in] This parameter is returned by
Image2PDFOCR_GetTextHandle function.
nPage
[in] Specify page number to set the word information.
nWordIndex
[in] Specify word index to set the word information.
X, Y, Width, Height
[in] Set word's X, Y, Width, Height information.
lpText
[in] Set word's text information.
Return Values
If the function succeeds, the return value is 1. If the
function fails, the return value is zero.
|