public class PDFText2HTML extends PDFTextStripper
charactersByArticle, document, output, outputEncoding, systemLineSeparator
Constructor and Description |
---|
PDFText2HTML(String encoding)
Constructor.
|
Modifier and Type | Method and Description |
---|---|
protected void |
endArticle()
Write out the article separator.
|
void |
endDocument(PDDocument pdf)
This method is available for subclasses of this class.
|
protected String |
getTitle()
This method will attempt to guess the title of the document using
either the document properties or the first lines of text.
|
protected void |
startArticle(boolean isltr)
Write out the article separator (div tag) with proper text direction
information.
|
protected void |
writeHeader()
Write the header to the output document.
|
protected void |
writePage()
This will print the text of the processed page to "output".
|
protected void |
writeParagraphEnd()
Writes the paragraph end "</p>" to the output.
|
protected void |
writeString(String chars)
Write a string to the output stream and escape some HTML characters.
|
protected void |
writeString(String text,
List<TextPosition> textPositions)
Write a string to the output stream, maintain font state, and escape some HTML characters.
|
endPage, getAddMoreFormatting, getArticleEnd, getArticleStart, getAverageCharTolerance, getCharactersByArticle, getCurrentPageNo, getDropThreshold, getEndBookmark, getEndPage, getIndentThreshold, getLineSeparator, getListItemPatterns, getOutput, getPageEnd, getPageSeparator, getPageStart, getParagraphEnd, getParagraphStart, getSeparateByBeads, getSortByPosition, getSpacingTolerance, getStartBookmark, getStartPage, getSuppressDuplicateOverlappingText, getText, getText, getWordSeparator, handleLineSeparation, inspectFontEncoding, isParagraphSeparation, matchListItemPattern, matchPattern, processPage, processPages, processTextPosition, resetEngine, setAddMoreFormatting, setArticleEnd, setArticleStart, setAverageCharTolerance, setDropThreshold, setEndBookmark, setEndPage, setIndentThreshold, setLineSeparator, setListItemPatterns, setPageEnd, setPageSeparator, setPageStart, setParagraphEnd, setParagraphStart, setShouldSeparateByBeads, setSortByPosition, setSpacingTolerance, setStartBookmark, setStartPage, setSuppressDuplicateOverlappingText, setWordSeparator, startArticle, startDocument, startPage, writeCharacters, writeLineSeparator, writePageEnd, writePageSeperator, writePageStart, writeParagraphSeparator, writeParagraphStart, writeText, writeText, writeWordSeparator
getColorSpaces, getCurrentPage, getFonts, getGraphicsStack, getGraphicsState, getGraphicsStates, getResources, getTextLineMatrix, getTextMatrix, getTotalCharCnt, getValidCharCnt, getXObjects, isForceParsing, processEncodedText, processOperator, processOperator, processStream, processSubStream, registerOperatorProcessor, setColorSpaces, setFonts, setForceParsing, setGraphicsStack, setGraphicsState, setGraphicsStates, setTextLineMatrix, setTextMatrix
public PDFText2HTML(String encoding) throws IOException
encoding
- The encoding to be used.
IOException
- If there is an error during initialization.
protected void writeHeader() throws IOException
IOException
- If there is a problem writing out the header to the document.
protected void writePage() throws IOException
writePage
in class PDFTextStripper
IOException
- If there is an error writing the text.
public void endDocument(PDDocument pdf) throws IOException
endDocument
in class PDFTextStripper
pdf
- The PDF document that is being processed.
IOException
- If an IO error occurs.
protected String getTitle()
protected void startArticle(boolean isltr) throws IOException
startArticle
in class PDFTextStripper
isltr
- true if direction of text is left to right.
IOException
- If there is an error writing to the stream.
protected void endArticle() throws IOException
endArticle
in class PDFTextStripper
IOException
- If there is an error writing to the stream.
protected void writeString(String text, List<TextPosition> textPositions) throws IOException
writeString
in class PDFTextStripper
text
- The text to write to the stream.
textPositions
- The corresponding text positions.
IOException
- If there is an error writing to the stream.
protected void writeString(String chars) throws IOException
writeString
in class PDFTextStripper
chars
- String to be written to the stream.
IOException
- If there is an error writing to the stream.
protected void writeParagraphEnd() throws IOException
writeParagraphEnd
in class PDFTextStripper
IOException
- If something went wrong.

Copyright © 2002-2015 The Apache Software Foundation. All Rights Reserved.