Take the 2-minute tour ×
Stack Overflow is a question and answer site for professional and enthusiast programmers. It's 100% free, no registration required.

I am converting a .doc file to PDF with the following Java code, which uses the Apache POI library:

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.OutputStream;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

import com.lowagie.text.Document;
import com.lowagie.text.Paragraph;
import com.lowagie.text.pdf.PdfWriter;

/**
 * Command-line test that copies the paragraph text of a Word {@code .doc}
 * file into a new PDF file, one PDF paragraph per extracted Word paragraph.
 *
 * <p>Only the text returned by {@code WordExtractor.getParagraphText()} is
 * written to the PDF; nothing else from the Word document is read here.
 */
public class TestDoc {

    /** Word document read when no input path is given on the command line. */
    private static final String DEFAULT_INPUT_PATH = "D:/vijay/doctopdf/test.doc";

    /** PDF file written when no output path is given on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "D:/vijay/doctopdf/test.pdf";

    /**
     * Runs the conversion and reports progress and failures on standard output.
     *
     * @param args optional; {@code args[0]} overrides the input {@code .doc}
     *             path and {@code args[1]} overrides the output PDF path.
     *             Missing entries fall back to the built-in default paths.
     */
    public static void main(String[] args) {
        String inputPath = DEFAULT_INPUT_PATH;
        String outputPath = DEFAULT_OUTPUT_PATH;
        if (args != null && args.length > 0) {
            inputPath = args[0];
        }
        if (args != null && args.length > 1) {
            outputPath = args[1];
        }

        FileInputStream input = null;
        OutputStream output = null;
        Document document = new Document();
        try {
            System.out.println("Starting the test");

            input = new FileInputStream(inputPath);
            HWPFDocument doc = new HWPFDocument(new POIFSFileSystem(input));
            WordExtractor we = new WordExtractor(doc);

            output = new FileOutputStream(new File(outputPath));
            PdfWriter writer = PdfWriter.getInstance(document, output);

            // Call sequence kept exactly as in the original test.
            document.open();
            writer.setPageEmpty(true);
            document.newPage();
            writer.setPageEmpty(true);

            String[] paragraphs = we.getParagraphText();
            for (int i = 0; i < paragraphs.length; i++) {
                // Strip the line terminator (optional control-M, optional CR,
                // then LF) so it is not carried into the PDF paragraph.
                String text = paragraphs[i].replaceAll("\\cM?\r?\n", "");
                System.out.println("Length:" + text.length());
                System.out.println("Paragraph" + i + ": " + text);

                // add the paragraph to the document
                document.add(new Paragraph(text));
            }

            System.out.println("Document testing completed");
        } catch (Exception e) {
            System.out.println("Exception during test");
            e.printStackTrace();
        } finally {
            // Close the document first so it can finish writing to the output
            // stream, then release both file handles explicitly.
            closeDocument(document);
            closeQuietly(input);
            closeQuietly(output);
        }
    }

    /**
     * Closes the PDF document if it was opened, reporting (not rethrowing)
     * any failure so that it cannot hide an earlier exception.
     */
    private static void closeDocument(Document document) {
        try {
            if (document.isOpen()) {
                document.close();
            }
        } catch (RuntimeException e) {
            // NOTE(review): some iText versions are believed to throw here when
            // the document has no content - confirm against the version in use.
            System.out.println("Exception while closing the document");
            e.printStackTrace();
        }
    }

    /** Closes a stream if it was opened, reporting any I/O failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (java.io.IOException e) {
            System.out.println("Exception while closing a stream");
            e.printStackTrace();
        }
    }

}

The above code runs successfully, but it converts only the text to PDF. When the doc contains tables, images, etc., they do not appear in the resulting PDF. Does anyone know how I can convert a doc to PDF with full accuracy and formatting?

share|improve this question
add comment

1 Answer

You can use WordExtractor from Apache Tika Parser

share|improve this answer
add comment

Your Answer

 
discard

By posting your answer, you agree to the privacy policy and terms of service.

Not the answer you're looking for? Browse other questions tagged or ask your own question.