/*
* convertSegmentedPagesToPS()
*
* Input: pagedir (input page image directory)
* pagestr (<optional> substring filter on page filenames;
* can be NULL)
* page_numpre (number of characters in page name before number)
* maskdir (input mask image directory)
* maskstr (<optional> substring filter on mask filenames;
* can be NULL)
* mask_numpre (number of characters in mask name before number)
* numpost (number of characters in names after number)
* maxnum (only consider page numbers up to this value)
* textscale (scale of text output relative to pixs)
* imagescale (scale of image output relative to pixs)
* threshold (for binarization; typ. about 190; 0 for default)
* fileout (output ps file)
* Return: 0 if OK, 1 on error
*
* Notes:
* (1) This generates a PS file for all page image and mask files in two
* specified directories and that contain the page numbers as
* specified below. The two directories can be the same, in which
* case the page and mask files are differentiated by the two
* substrings for string matches.
* (2) The page images are taken in lexicographic order.
* Mask images whose numbers match the page images are used to
* segment the page images. Page images without a matching
* mask image are scaled, thresholded and rendered entirely as text.
* (3) Each PS page is generated as a compressed representation of
* the page image, where the part of the image under the mask
* is suitably scaled and compressed as DCT (i.e., jpeg), and
* the remaining part of the page is suitably scaled, thresholded,
* compressed as G4 (i.e., tiff g4), and rendered by painting
* black through the resulting text mask.
* (4) The scaling is typically 2x down for the DCT component
* (@imagescale = 0.5) and 2x up for the G4 component
* (@textscale = 2.0).
* (5) The resolution is automatically set to fit to a
* letter-size (8.5 x 11 inch) page.
* (6) Both the DCT and the G4 encoding are PostScript level 2.
* (7) It is assumed that the page number is contained within
* the basename (the filename without directory or extension).
* @page_numpre is the number of characters in the page basename
* preceding the actual page number; @mask_numpre is likewise for
* the mask basename; @numpost is the number of characters
* following the page number. For example, for mask name
* mask_006.tif, mask_numpre = 5 ("mask_").
* (8) To render a page as is -- that is, with no thresholding
* of any pixels -- use a mask in the mask directory that is
* full size with all pixels set to 1. If the page is 1 bpp,
* it is not necessary to have a mask.
*/
l_int32
convertSegmentedPagesToPS(const char *pagedir,
const char *pagestr,
l_int32 page_numpre,
const char *maskdir,
const char *maskstr,
l_int32 mask_numpre,
l_int32 numpost,
l_int32 maxnum,
l_float32 textscale,
l_float32 imagescale,
l_int32 threshold,
const char *fileout)
{
l_int32 pageno, i, npages;
PIX *pixs, *pixm;
SARRAY *sapage, *samask;
PROCNAME("convertSegmentedPagesToPS");
if (!pagedir)
return ERROR_INT("pagedir not defined", procName, 1);
if (!maskdir)
return ERROR_INT("maskdir not defined", procName, 1);
if (!fileout)
return ERROR_INT("fileout not defined", procName, 1);
if (threshold <= 0) {
L_INFO("setting threshold to 190\n", procName);
threshold = 190;
}
/* Get numbered full pathnames; max size of sarray is maxnum */
sapage = getNumberedPathnamesInDirectory(pagedir, pagestr,
page_numpre, numpost, maxnum);
samask = getNumberedPathnamesInDirectory(maskdir, maskstr,
mask_numpre, numpost, maxnum);
sarrayPadToSameSize(sapage, samask, (char *)"");
if ((npages = sarrayGetCount(sapage)) == 0) {
sarrayDestroy(&sapage);
sarrayDestroy(&samask);
return ERROR_INT("no matching pages found", procName, 1);
}
/* Generate the PS file */
pageno = 1;
for (i = 0; i < npages; i++) {
if ((pixs = pixReadIndexed(sapage, i)) == NULL)
//......... part of the code is omitted here .........
//......... part of the code is omitted here .........
* prototypes have one or more sets of '(' followed eventually
* by a ')', and end with ';'. But function definitions have
* tokens, followed by '(', more tokens, ')' and then
* immediately a '{'. We would generate a prototype from this
* by adding a ';' to all tokens up to the ')'. So we use
* these special tokens to decide what we are parsing. And
* whenever a function definition is found and the prototype
* extracted, we skip through the rest of the function
* past the corresponding '}'. This token ends a line, and
* is often on a line of its own. But as it turns out,
* the only keyword we need to consider is 'static'.
* (4) Plan 3. Consider the parentheses and braces for various
* declarations. A struct, enum, or union has a pair of
* braces followed by a semicolon. They cannot have parentheses
* before the left brace, but a struct can have lots of parentheses
* within the brace set. A function prototype has no braces.
* A function definition can have sets of left and right
* parentheses, but these are followed by a left brace.
* So plan 3 looks at the way parentheses and braces are
* organized. Once the beginning of a function definition
* is found, the prototype is extracted and we search for
* the ending right brace.
* (5) To find the ending right brace, it is necessary to do some
* careful parsing. For example, in this file, we have
* left and right braces as characters, and these must not
* be counted. Somewhat more tricky, the file fhmtauto.c
* generates code, and includes a right brace in a string.
* So we must not include braces that are in strings. But how
* do we know if something is inside a string? Keep state,
* starting with not-inside, and every time you hit a double quote
* that is not escaped, toggle the condition. Any brace
* found in the state of being within a string is ignored.
* (6) When a prototype is extracted, it is put in a canonical
* form (i.e., cleaned up). Finally, we check that it is
* not static and save it. (If static, it is ignored).
* (7) The @prestring for unix is NULL; it is included here so that
* you can use Microsoft's declaration for importing or
* exporting to a dll. See environ.h for examples of use.
* Here, we set: @prestring = "LEPT_DLL ". Note in particular
* the space character that will separate 'LEPT_DLL' from
* the standard unix prototype that follows.
*/
/*
 *  Implementation summary:
 *      filein     file that is read into memory and scanned for
 *                 function definitions
 *      prestring  optional; when non-NULL it is joined in front of
 *                 every prototype that is saved
 *      Returns the saved (non-static) prototypes flattened into one
 *      string with newlines between them, or NULL on error.
 *      The returned string is presumably owned by the caller --
 *      TODO confirm against sarrayToString().
 */
char *
parseForProtos(const char *filein,
               const char *prestring)
{
char    *strdata, *str, *newstr, *parsestr, *secondword;
l_int32  nbytes, start, next, stop, charindex, found;
SARRAY  *sa, *saout, *satest;

    PROCNAME("parseForProtos");

    if (!filein)
        return (char *)ERROR_PTR("filein not defined", procName, NULL);

        /* Read in the cpp output into memory, one string for each
         * line in the file, omitting blank lines.  Each step is
         * checked so that a failed read cannot propagate a NULL
         * pointer into the parser. */
    if ((strdata = (char *)arrayRead(filein, &nbytes)) == NULL)
        return (char *)ERROR_PTR("strdata not read", procName, NULL);
    if ((sa = sarrayCreateLinesFromString(strdata, 0)) == NULL) {
        FREE(strdata);
        return (char *)ERROR_PTR("sa not made", procName, NULL);
    }
    if ((saout = sarrayCreate(0)) == NULL) {
        FREE(strdata);
        sarrayDestroy(&sa);
        return (char *)ERROR_PTR("saout not made", procName, NULL);
    }

    next = 0;
    while (1) {  /* repeat after each non-static prototype is extracted */
        searchForProtoSignature(sa, next, &start, &stop, &charindex, &found);
        if (!found)
            break;

        str = captureProtoSignature(sa, start, stop, charindex);
        if (str) {
                /* Make sure it is not static.  Note that 'extern' has
                 * been prepended to the prototype, so the 'static'
                 * keyword, if it exists, would be the second word.
                 * The flag 0 presumably requests the stored string
                 * without a copy; it is not freed here. */
            satest = sarrayCreateWordsFromString(str);
            secondword = (satest) ? sarrayGetString(satest, 1, 0) : NULL;

                /* A signature with fewer than two words has no second
                 * word; it is discarded instead of being handed
                 * to strcmp() as a NULL pointer. */
            if (secondword && strcmp(secondword, "static")) {  /* not static */
                if (prestring) {  /* prepend it to the prototype */
                    newstr = stringJoin(prestring, str);
                    if (newstr)
                        sarrayAddString(saout, newstr, L_INSERT);
                    FREE(str);
                } else {
                    sarrayAddString(saout, str, L_INSERT);
                }
            } else {
                FREE(str);
            }
            if (satest)
                sarrayDestroy(&satest);
        }

            /* Move past the body of the function just handled, even
             * when its signature could not be captured. */
        skipToEndOfFunction(sa, stop, charindex, &next);
        if (next == -1) break;
    }

        /* Flatten into a string with newlines between prototypes */
    parsestr = sarrayToString(saout, 1);
    FREE(strdata);
    sarrayDestroy(&sa);
    sarrayDestroy(&saout);
    return parsestr;
}
/*
 *  convertFilesToPS()
 *
 *      Input:  dirin (input directory)
 *              substr (<optional> substring filter on filenames; can be NULL)
 *              res (typ. 300 or 600 ppi; values <= 0 are replaced by 300)
 *              fileout (output ps file)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) This generates a PS file for all image files in a specified
 *          directory that contain the substr pattern to be matched.
 *      (2) Each image is written to a separate page in the output PS file.
 *      (3) All images are written compressed:
 *              * if tiffg4  -->  use ccittg4
 *              * if jpeg    -->  use dct
 *              * all others -->  use flate
 *          If the image is jpeg or tiffg4, we use the existing compressed
 *          strings for the encoding; otherwise, we read the image into
 *          a pix and flate-encode the pieces.
 *      (4) The resolution is often confusing.  It is interpreted
 *          as the resolution of the output display device:  "If the
 *          input image were digitized at 300 ppi, what would it
 *          look like when displayed at res ppi."  So, for example,
 *          if res = 100 ppi, then the display pixels are 3x larger
 *          than the 300 ppi pixels, and the image will be rendered
 *          3x larger.
 *      (5) The size of the PostScript file is independent of the resolution,
 *          because the entire file is encoded.  The res parameter just
 *          tells the PS decomposer how to render the page.  Therefore,
 *          for minimum file size without loss of visual information,
 *          if the output res is less than 300, you should downscale
 *          the image to the output resolution before wrapping in PS.
 *      (6) The "canvas" on which the image is rendered, at the given
 *          output resolution, is a standard page size (8.5 x 11 in).
 *      (7) A failure to list the directory, or a non-zero result from
 *          sarrayConvertFilesToPS(), is reported to the caller.
 */
l_int32
convertFilesToPS(const char *dirin,
                 const char *substr,
                 l_int32 res,
                 const char *fileout)
{
l_int32  ret;
SARRAY  *sa;

    PROCNAME("convertFilesToPS");

    if (!dirin)
        return ERROR_INT("dirin not defined", procName, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", procName, 1);
    if (res <= 0) {
        L_INFO("setting res to 300 ppi", procName);
        res = 300;
    }
    if (res < 10 || res > 4000)
        L_WARNING("res is typically in the range 300-600 ppi", procName);

        /* Get all filtered and sorted full pathnames.  The lookup
         * returns NULL on error, so stop here rather than pass a
         * NULL array to the converter. */
    if ((sa = getSortedPathnamesInDirectory(dirin, substr, 0, 0)) == NULL)
        return ERROR_INT("sa not made", procName, 1);

        /* Generate the PS file and hand its status back to the caller.
         * Assumes the converter uses the 0 = OK convention -- confirm. */
    ret = sarrayConvertFilesToPS(sa, res, fileout);
    sarrayDestroy(&sa);
    return ret;
}
/*
 *  convertFilesFittedToPS()
 *
 *      Input:  dirin (input directory)
 *              substr (<optional> substring filter on filenames; can be NULL)
 *              xpts, ypts (desired size in printer points; use 0 for default)
 *              fileout (output ps file)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) This generates a PS file for all files in a specified directory
 *          that contain the substr pattern to be matched.
 *      (2) Each image is written to a separate page in the output PS file.
 *      (3) All images are written compressed:
 *              * if tiffg4  -->  use ccittg4
 *              * if jpeg    -->  use dct
 *              * all others -->  use flate
 *          If the image is jpeg or tiffg4, we use the existing compressed
 *          strings for the encoding; otherwise, we read the image into
 *          a pix and flate-encode the pieces.
 *      (4) The resolution is internally determined such that the images
 *          are rendered, in at least one direction, at 100% of the given
 *          size in printer points.  Use 0.0 for xpts or ypts to get
 *          the default value, which is 612.0 or 792.0, rsp.
 *      (5) The size of the PostScript file is independent of the resolution,
 *          because the entire file is encoded.  The @xpts and @ypts
 *          parameter tells the PS decomposer how to render the page.
 *      (6) A failure to list the directory, or a non-zero result from
 *          sarrayConvertFilesFittedToPS(), is reported to the caller.
 */
l_int32
convertFilesFittedToPS(const char *dirin,
                       const char *substr,
                       l_float32 xpts,
                       l_float32 ypts,
                       const char *fileout)
{
l_int32  ret;
SARRAY  *sa;

    PROCNAME("convertFilesFittedToPS");

    if (!dirin)
        return ERROR_INT("dirin not defined", procName, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", procName, 1);

        /* Non-positive sizes select the defaults.  The values are
         * printer points, so the messages say "points", not "ppi". */
    if (xpts <= 0.0) {
        L_INFO("setting xpts to 612.0 points", procName);
        xpts = 612.0;
    }
    if (ypts <= 0.0) {
        L_INFO("setting ypts to 792.0 points", procName);
        ypts = 792.0;
    }
    if (xpts < 100.0 || xpts > 2000.0 || ypts < 100.0 || ypts > 2000.0)
        L_WARNING("xpts,ypts are typically in the range 500-800", procName);

        /* Get all filtered and sorted full pathnames.  The lookup
         * returns NULL on error, so stop here rather than pass a
         * NULL array to the converter. */
    if ((sa = getSortedPathnamesInDirectory(dirin, substr, 0, 0)) == NULL)
        return ERROR_INT("sa not made", procName, 1);

        /* Generate the PS file and hand its status back to the caller.
         * Assumes the converter uses the 0 = OK convention -- confirm. */
    ret = sarrayConvertFilesFittedToPS(sa, xpts, ypts, fileout);
    sarrayDestroy(&sa);
    return ret;
}
/*!
 *  getSortedPathnamesInDirectory()
 *
 *      Input:  dirname (directory to list)
 *              substr (<optional> substring that a filename must contain;
 *                      can be NULL)
 *              firstpage (0-based index of the first entry wanted)
 *              npages (number of entries wanted; 0 means all that remain)
 *      Return: sarray of sorted pathnames, or NULL on error
 *
 *  Notes:
 *      (1) The directory listing is filtered by 'substr' and then sorted
 *          in increasing order.
 *      (2) 'firstpage' is clamped to the range [0, nfiles - 1], and the
 *          last index used never exceeds nfiles - 1.
 *      (3) Each selected filename is turned into a pathname with
 *          genPathname() before being stored in the result.
 *      (4) When the filtered listing has no entries, a warning is issued
 *          and the filtered listing itself is returned.
 */
SARRAY *
getSortedPathnamesInDirectory(const char *dirname,
                              const char *substr,
                              l_int32 firstpage,
                              l_int32 npages)
{
char    *name, *path;
l_int32  idx, count, first, wanted, last;
SARRAY  *saall, *samatch, *saresult;

    PROCNAME("getSortedPathnamesInDirectory");

    if (dirname == NULL)
        return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);

        /* List the directory and keep only the matching names */
    saall = getFilenamesInDirectory(dirname);
    if (saall == NULL)
        return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
    samatch = sarraySelectBySubstring(saall, substr);
    sarrayDestroy(&saall);

    count = sarrayGetCount(samatch);
    if (count == 0) {
        L_WARNING("no files found", procName);
        return samatch;
    }

        /* Sort in place, then work out the inclusive index window */
    sarraySort(samatch, samatch, L_SORT_INCREASING);
    first = L_MIN(L_MAX(firstpage, 0), count - 1);
    wanted = npages;
    if (wanted == 0)
        wanted = count - first;
    last = L_MIN(first + wanted - 1, count - 1);

        /* Build the pathname for every entry in the window */
    saresult = sarrayCreate(last - first + 1);
    idx = first;
    while (idx <= last) {
        name = sarrayGetString(samatch, idx, L_NOCOPY);
        path = genPathname(dirname, name);
        sarrayAddString(saresult, path, L_INSERT);
        idx++;
    }

    sarrayDestroy(&samatch);
    return saresult;
}
/*!
 * \brief   l_getIndexFromFile()
 *
 * \param[in]    filename   file whose type is to be identified
 * \param[out]   pindex     found index; set to 0 before any lookup
 * \return  0 if found, 1 on error.
 *
 * <pre>
 * Notes:
 *      (1) The first word of the first line that does not begin with
 *          a newline is looked up with l_getIndexFromStructname().
 *      (2) If that lookup fails, findFileFormat() is tried on the file;
 *          on success the index for "Pix" is used.
 * </pre>
 */
static l_int32
l_getIndexFromFile(const char *filename,
                   l_int32 *pindex)
{
char     line[256];
char    *firstword;
FILE    *stream;
l_int32  lookupfailed, format;
SARRAY  *sawords;

    PROCNAME("l_getIndexFromFile");

    if (pindex == NULL)
        return ERROR_INT("&index not defined", procName, 1);
    *pindex = 0;
    if (filename == NULL)
        return ERROR_INT("filename not defined", procName, 1);

        /* Read lines until one holds more than a bare newline */
    stream = fopenReadStream(filename);
    if (stream == NULL)
        return ERROR_INT("stream not opened", procName, 1);
    for (;;) {
        if (fgets(line, sizeof(line), stream) == NULL) {
            fclose(stream);
            return ERROR_INT("fgets read fail", procName, 1);
        }
        if (line[0] != '\n')
            break;
    }
    fclose(stream);

        /* Look up the first word of that line */
    sawords = sarrayCreateWordsFromString(line);
    firstword = sarrayGetString(sawords, 0, L_NOCOPY);
    lookupfailed = l_getIndexFromStructname(firstword, pindex);
    sarrayDestroy(&sawords);
    if (!lookupfailed)
        return 0;

        /* Not a known struct name: the file may be a compressed pix */
    if (findFileFormat(filename, &format) != 0)
        return ERROR_INT("no file type identified", procName, 1);
    l_getIndexFromStructname("Pix", pindex);
    return 0;
}
/*!
 * \brief   strcodeCreateFromFile()
 *
 * \param[in]    filein   containing filenames of serialized data
 * \param[in]    fileno   integer that labels the two output files
 * \param[in]    outdir   [optional] if null, files are made in /tmp/lept/auto
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) The %filein has one filename on each line.
 *          Comment lines begin with "#".
 *      (2) The output is 2 files:
 *             autogen.\<fileno\>.c
 *             autogen.\<fileno\>.h
 *      (3) A file whose type cannot be identified is reported and
 *          skipped; the remaining files are still processed.
 * </pre>
 */
l_int32
strcodeCreateFromFile(const char *filein,
                      l_int32 fileno,
                      const char *outdir)
{
char        *fname;
const char  *type;
l_uint8     *data;
size_t       nbytes;
l_int32      i, n, index;
SARRAY      *sa;
L_STRCODE   *strcode;

    PROCNAME("strcodeCreateFromFile");

    if (!filein)
        return ERROR_INT("filein not defined", procName, 1);

        /* Split the file into lines; the raw bytes are not needed
         * once the line array exists. */
    if ((data = l_binaryRead(filein, &nbytes)) == NULL)
        return ERROR_INT("data not read from file", procName, 1);
    sa = sarrayCreateLinesFromString((char *)data, 0);
    LEPT_FREE(data);
    if (!sa)
        return ERROR_INT("sa not made", procName, 1);
    if ((n = sarrayGetCount(sa)) == 0) {
        sarrayDestroy(&sa);
        return ERROR_INT("no filenames in the file", procName, 1);
    }

    if ((strcode = strcodeCreate(fileno)) == NULL) {
        sarrayDestroy(&sa);
        return ERROR_INT("strcode not made", procName, 1);
    }

    for (i = 0; i < n; i++) {
        fname = sarrayGetString(sa, i, L_NOCOPY);
        if (fname[0] == '#') continue;  /* comment line */
        if (l_getIndexFromFile(fname, &index)) {
            L_ERROR("File %s has no recognizable type\n", procName, fname);
        } else {
            type = l_assoc[index].type;
            L_INFO("File %s is type %s\n", procName, fname, type);
            strcodeGenerate(strcode, fname, type);
        }
    }
    strcodeFinalize(&strcode, outdir);

        /* The filename strings are only borrowed from sa (L_NOCOPY),
         * so the array is released after the last use of them. */
    sarrayDestroy(&sa);
    return 0;
}
/*!
 *  pixaReadFiles()
 *
 *      Input:  dirname (full path of the directory to read from)
 *              substr (<optional> substring that a filename, without its
 *                      directory, must contain; can be null)
 *      Return: pixa, or null on error
 *
 *  Notes:
 *      (1) The sorted pathnames of the matching files are obtained
 *          from getSortedPathnamesInDirectory() and passed to
 *          pixaReadFilesSA(), whose result is returned.
 *      (2) With a null @substr, no filenames are filtered out.
 */
PIXA *
pixaReadFiles(const char *dirname,
              const char *substr)
{
PIXA    *pixaout;
SARRAY  *sapaths;

    PROCNAME("pixaReadFiles");

    if (dirname == NULL)
        return (PIXA *)ERROR_PTR("dirname not defined", procName, NULL);

    sapaths = getSortedPathnamesInDirectory(dirname, substr, 0, 0);
    if (sapaths == NULL)
        return (PIXA *)ERROR_PTR("sa not made", procName, NULL);

        /* The pathname list is only needed while reading */
    pixaout = pixaReadFilesSA(sapaths);
    sarrayDestroy(&sapaths);
    return pixaout;
}
/*!
 * \brief   sarrayUnionByAset()
 *
 * \param[in]    sa1, sa2
 * \return  sad with the union of the string set, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Duplicates are removed from the concatenation of the two arrays.
 *      (2) The key for each string is a 64-bit hash.
 *      (3) Algorithm: Concatenate the two sarrays.  Then build a set,
 *          using hashed strings as keys.  As the set is built, first do
 *          a find; if not found, add the key to the set and add the string
 *          to the output sarray.  This is O(nlogn).
 *      (4) Neither input is modified; the concatenation is done on
 *          a copy of sa1.
 * </pre>
 */
SARRAY *
sarrayUnionByAset(SARRAY *sa1,
                  SARRAY *sa2)
{
SARRAY  *sa3, *sad;

    PROCNAME("sarrayUnionByAset");

    if (!sa1)
        return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL);
    if (!sa2)
        return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL);

        /* Join.  Both steps are checked so that a failed copy or join
         * cannot silently produce a union that is missing strings.
         * Assumes sarrayJoin() returns 0 on success -- confirm. */
    if ((sa3 = sarrayCopy(sa1)) == NULL)
        return (SARRAY *)ERROR_PTR("sa3 not made", procName, NULL);
    if (sarrayJoin(sa3, sa2) != 0) {
        sarrayDestroy(&sa3);
        return (SARRAY *)ERROR_PTR("join failed for sa3", procName, NULL);
    }

        /* Eliminate duplicates */
    sad = sarrayRemoveDupsByAset(sa3);
    sarrayDestroy(&sa3);
    return sad;
}
/*!
 *  numaCreateFromString()
 *
 *      Input:  string (of comma-separated numbers)
 *      Return: na, or null on error
 *
 *  Notes:
 *      (1) Each comma-separated piece is scanned with "%f", so both
 *          ints and floats are accepted and all are stored as floats.
 *          To use them as integers (e.g., for indexing into arrays),
 *          use numaGetIValue(...).
 *      (2) Every piece is examined, and each one that does not scan
 *          is reported.  If any piece fails, no numa is returned.
 */
NUMA *
numaCreateFromString(const char *str)
{
char      *token;
l_int32    idx, ntokens, nbad;
l_float32  fval;
NUMA      *naout;
SARRAY    *satokens;

    PROCNAME("numaCreateFromString");

    if (str == NULL || str[0] == '\0')
        return (NUMA *)ERROR_PTR("str not defined or empty", procName, NULL);

        /* Break the input at the commas */
    satokens = sarrayCreate(0);
    sarraySplitString(satokens, str, ",");
    ntokens = sarrayGetCount(satokens);
    naout = numaCreate(ntokens);

        /* Convert each piece, counting the ones that fail */
    nbad = 0;
    for (idx = 0; idx < ntokens; idx++) {
        token = sarrayGetString(satokens, idx, L_NOCOPY);
        if (sscanf(token, "%f", &fval) == 1) {
            numaAddNumber(naout, fval);
            continue;
        }
        L_ERROR("substr %d not float\n", procName, idx);
        nbad++;
    }
    sarrayDestroy(&satokens);

    if (nbad == 0)
        return naout;

    numaDestroy(&naout);
    return (NUMA *)ERROR_PTR("non-floats in string", procName, NULL);
}
/* Please leave a comment. */