Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
196 views
in Technique[技术] by (71.8m points)

Extract images using iTextSharp

I have been using this code with great success to pull out the first image found in each page of a PDF. However, it is now not working with some new PDFs for an unknown reason. I have used other tools (Datalogics, etc.) that do pull out the images fine from these new PDFs. However, I do not want to buy Datalogics or any other tool if I can use iTextSharp. Can anybody tell me why this code is not finding the images in the PDF?

Knowns: my PDFs only have 1 image per page and nothing else.

using iTextSharp.text;
using iTextSharp.text.pdf;
...
/// <summary>
/// Saves the first image XObject found directly in each page's resources as
/// "&lt;pageNumber&gt;.jpg" inside <paramref name="outputPath"/>.
/// </summary>
/// <remarks>
/// Only XObjects listed in the page's own /Resources dictionary are examined;
/// images nested inside other XObjects are not searched. The stream bytes are
/// handed to <c>System.Drawing.Image.FromStream</c> exactly as returned by
/// <c>PdfReader.GetStreamBytesRaw</c>, so the stored bytes must already be in a
/// format GDI+ can load.
/// </remarks>
/// <param name="sourcePdf">Path of the PDF file to read.</param>
/// <param name="outputPath">Directory that receives the image files; created when the first image is written.</param>
public static void ExtractImagesFromPDF(string sourcePdf, string outputPath)
{
    // NOTE:  This will only get the first image it finds per page.
    PdfReader pdf = new PdfReader(sourcePdf);

    try
    {
        for (int pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
        {
            PdfDictionary pg = pdf.GetPageN(pageNumber);

            // A page without /Resources or without /XObject has nothing to extract.
            PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
            if (res == null)
            {
                continue;
            }

            PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
            if (xobj == null)
            {
                continue;
            }

            foreach (PdfName name in xobj.Keys)
            {
                PdfObject obj = xobj.Get(name);
                if (!obj.IsIndirect())
                {
                    continue;
                }

                PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
                if (tg == null)
                {
                    continue;
                }

                PdfName type = (PdfName)PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE));
                if (!PdfName.IMAGE.Equals(type))
                {
                    continue;
                }

                // Number is already an int; no string round-trip is needed.
                int xrefIndex = ((PRIndirectReference)obj).Number;
                PRStream imageStream = (PRStream)pdf.GetPdfObject(xrefIndex);
                byte[] bytes = PdfReader.GetStreamBytesRaw(imageStream);
                if (bytes == null)
                {
                    continue;
                }

                // The image must be saved while its backing stream is still open.
                using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes))
                using (System.Drawing.Image img = System.Drawing.Image.FromStream(memStream))
                using (System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1))
                {
                    // CreateDirectory does nothing when the directory already exists.
                    Directory.CreateDirectory(outputPath);

                    string path = Path.Combine(outputPath, String.Format(@"{0}.jpg", pageNumber));
                    parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0);
                    System.Drawing.Imaging.ImageCodecInfo jpegEncoder = Utilities.GetImageEncoder("JPEG");
                    img.Save(path, jpegEncoder, parms);
                }

                // First image of this page has been written; move on to the next page.
                break;
            }
        }
    }
    finally
    {
        pdf.Close();
    }
}
Question&Answers:os

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Answer

0 votes
by (71.8m points)

I found that my problem was that I was not recursively searching inside forms and groups for images. Basically, the original code would only find images that were embedded at the root of the PDF document. Here is the revised method plus a new method (FindImageInPDFDictionary) that recursively searches for images in the page. NOTE: the flaws of only supporting JPEG and non-compressed images still apply. See R Ubben's code for options to fix those flaws. Hope this helps someone.

    /// <summary>
    /// Saves one image per page of <paramref name="sourcePdf"/> as
    /// "&lt;pageNumber&gt;.jpg" inside <paramref name="outputPath"/>.
    /// </summary>
    /// <remarks>
    /// The image is located with <c>FindImageInPDFDictionary</c>, so only the first
    /// image that search returns for a page is written. The stream bytes are handed
    /// to <c>System.Drawing.Image.FromStream</c> exactly as returned by
    /// <c>PdfReader.GetStreamBytesRaw</c>, so the stored bytes must already be in a
    /// format GDI+ can load.
    /// </remarks>
    /// <param name="sourcePdf">Path of the PDF file to read.</param>
    /// <param name="outputPath">Directory that receives the image files; created when the first image is written.</param>
    public static void ExtractImagesFromPDF(string sourcePdf, string outputPath)
    {
        // NOTE:  This will only get the first image it finds per page.
        PdfReader pdf = new PdfReader(sourcePdf);

        try
        {
            for (int pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
            {
                PdfDictionary pg = pdf.GetPageN(pageNumber);

                // recursively search pages, forms and groups for images.
                PdfObject obj = FindImageInPDFDictionary(pg);
                if (obj == null)
                {
                    continue;
                }

                // Number is already an int; no string round-trip is needed.
                int xrefIndex = ((PRIndirectReference)obj).Number;
                PRStream imageStream = (PRStream)pdf.GetPdfObject(xrefIndex);
                byte[] bytes = PdfReader.GetStreamBytesRaw(imageStream);
                if (bytes == null)
                {
                    continue;
                }

                // The image must be saved while its backing stream is still open.
                using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes))
                using (System.Drawing.Image img = System.Drawing.Image.FromStream(memStream))
                using (System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1))
                {
                    // CreateDirectory does nothing when the directory already exists.
                    Directory.CreateDirectory(outputPath);

                    string path = Path.Combine(outputPath, String.Format(@"{0}.jpg", pageNumber));
                    parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0);
                    System.Drawing.Imaging.ImageCodecInfo jpegEncoder = Utilities.GetImageEncoder("JPEG");
                    img.Save(path, jpegEncoder, parms);
                }
            }
        }
        finally
        {
            pdf.Close();
        }
    }

     /// <summary>
     /// Searches the /Resources /XObject entries of <paramref name="pg"/> for the
     /// first image XObject, descending into form and group XObjects.
     /// </summary>
     /// <param name="pg">A page dictionary, or the dictionary of a nested form/group XObject.</param>
     /// <returns>
     /// The indirect reference of the first image found, or <c>null</c> when the
     /// dictionary (and everything nested below it) contains no image.
     /// </returns>
     private static PdfObject FindImageInPDFDictionary(PdfDictionary pg)
    {
        if (pg == null)
        {
            return null;
        }

        // A dictionary without /Resources or without /XObject holds no images.
        PdfDictionary res = PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES)) as PdfDictionary;
        if (res == null)
        {
            return null;
        }

        PdfDictionary xobj = PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)) as PdfDictionary;
        if (xobj == null)
        {
            return null;
        }

        foreach (PdfName name in xobj.Keys)
        {
            PdfObject obj = xobj.Get(name);
            if (obj == null || !obj.IsIndirect())
            {
                continue;
            }

            PdfDictionary tg = PdfReader.GetPdfObject(obj) as PdfDictionary;
            if (tg == null)
            {
                continue;
            }

            PdfName type = PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE)) as PdfName;

            // image directly in this dictionary's resources
            if (PdfName.IMAGE.Equals(type))
            {
                return obj;
            }

            // image inside a form or a group
            if (PdfName.FORM.Equals(type) || PdfName.GROUP.Equals(type))
            {
                // Only stop when the nested search actually found something;
                // otherwise keep looking through the remaining XObjects.
                // NOTE: there is no guard against XObjects that reference each
                // other cyclically; such a file would recurse without end.
                PdfObject nested = FindImageInPDFDictionary(tg);
                if (nested != null)
                {
                    return nested;
                }
            }
        }

        return null;
    }

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Click Here to Ask a Question

...