|
com.aspose.pdf.kit
Class PdfExtractor
java.lang.Object
com.aspose.pdf.kit.PdfExtractor
public class PdfExtractor - extends java.lang.Object
Represents a class to extract images and text from a pdf file.
NOTE: This is a Beta version of PdfExtractor. Some features may not be well supported, and we may not be able to fix them in a short time.
|
Constructor Summary |
PdfExtractor()
The constructor of the PdfExtractor object. |
|
Method Summary |
void |
bindPdf(java.io.InputStream inputStream)
Binds a Pdf stream for extraction. |
void |
bindPdf(java.lang.String inputFile)
Binds a Pdf file for extraction. |
void |
extractAttachment()
Extracts attachments from a Pdf document. |
void |
extractImage()
Extracts images from a Pdf document. |
void |
extractText()
Extracts text from a Pdf document. |
void |
extractTextInRectangle(java.awt.Rectangle rec)
Extracts the text content of the page within the rectangle. |
void |
extractTextInRectangle(java.awt.Rectangle rec,
ExtractTextMode extMode)
Extracts the text content of the page within the rectangle. |
void |
getAllRectangleText(java.io.OutputStream outputStream)
Saves all texts within the rectangle to stream. |
void |
getAllRectangleText(java.lang.String outputFile)
Saves all texts within the rectangle to file. |
java.io.ByteArrayOutputStream[] |
getAttachment()
Saves all the attachment files to streams. |
void |
getAttachment(java.lang.String outputPath)
Saves all the attachment files to outputPath. |
java.util.ArrayList |
getAttachNames()
Gets the file names of all the attachments. |
int |
getEndPage()
Gets endPage value. |
void |
getNextImage(java.io.OutputStream outputStream)
Saves image to stream with default image format - Jpeg. |
void |
getNextImage(java.io.OutputStream outputStream,
ImageType imageTypeName)
Saves image to stream with the given image format. |
void |
getNextImage(java.io.OutputStream outputStream,
java.lang.String imageTypeName)
Saves image to stream with the given image format name. |
void |
getNextImage(java.lang.String outputFile)
Saves image to file with default image format - Jpeg. |
void |
getNextImage(java.lang.String outputFile,
ImageType imageType)
Saves image to file with the given image format. |
void |
getNextImage(java.lang.String outputFile,
java.lang.String imageTypeName)
Saves image to file with the given image format name. |
java.lang.String |
getPassword()
Gets password. |
int |
getStartPage()
Gets startPage value. |
void |
getText(java.io.OutputStream outputStream)
Saves text to stream. |
void |
getText(java.lang.String outputFile)
Saves text to file. |
int |
getWordCount()
Returns the word count of the pdf document. |
boolean |
hasNextImage()
Determines whether more images can be retrieved. |
void |
setEndPage(int endPage)
Sets endPage value. |
void |
setPassword(java.lang.String password)
Sets password, use this password to decrypt the pdf file. |
void |
setStartPage(int startPage)
Sets startPage value. |
| Methods inherited from class java.lang.Object |
equals, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
PdfExtractor
public PdfExtractor()
- The constructor of the PdfExtractor object.
setStartPage
public void setStartPage(int startPage)
- Sets startPage value.
- Parameters:
startPage - the start position in the pdf file from which you want to extract.
getStartPage
public int getStartPage()
- Gets startPage value.
- Returns:
- the start position in the pdf file from which you want to extract.
setEndPage
public void setEndPage(int endPage)
- Sets endPage value.
- Parameters:
endPage - the end position in the pdf file up to which you want to extract.
getEndPage
public int getEndPage()
- Gets endPage value.
- Returns:
- the end position in the pdf file up to which you want to extract.
setPassword
public void setPassword(java.lang.String password)
- Sets password, use this password to decrypt the pdf file.
- Parameters:
password - the input pdf file's password.
getPassword
public java.lang.String getPassword()
- Gets password.
- Returns:
- the input pdf file's password we have set.
bindPdf
public void bindPdf(java.lang.String inputFile)
throws java.io.FileNotFoundException
- Binds a Pdf file for extraction.
- Parameters:
inputFile - The pdf file to be extracted.
- Throws:
java.io.FileNotFoundException
bindPdf
public void bindPdf(java.io.InputStream inputStream)
- Binds a Pdf stream for extraction.
- Parameters:
inputStream - The pdf Stream to be extracted.
- Throws:
java.io.FileNotFoundException
extractImage
public void extractImage()
throws java.lang.Exception
- Extracts images from a Pdf document.
[SampleCode]
PdfExtractor extractor = new PdfExtractor();
extractor.bindPdf(path + "Image.pdf");
extractor.extractImage();
String suffix = ".jpg";
int imageCount = 1;
while (extractor.hasNextImage()) {
extractor.getNextImage(path + imageCount + suffix);
imageCount++;
}
- Throws:
java.lang.Exception
getNextImage
public void getNextImage(java.lang.String outputFile)
throws java.lang.Exception
- Saves image to file with default image format - Jpeg.
- Parameters:
outputFile - The file path and name to save the image.
- Throws:
java.lang.Exception
getNextImage
public void getNextImage(java.lang.String outputFile,
java.lang.String imageTypeName)
throws java.lang.Exception
- Saves image to file with the given image format name.
Use this method before 2006/10/20. Aspose has upgraded to a new method getNextImage(String, ImageType).
Please use new method after that date.
- Parameters:
outputFile - The file path and name to save the image.
imageTypeName - Image format name of the extracted image.
- Throws:
java.lang.Exception
- See Also:
getNextImage(String, ImageType)
getNextImage
public void getNextImage(java.lang.String outputFile,
ImageType imageType)
throws java.lang.Exception
- Saves image to file with the given image format.
- Parameters:
outputFile - The file path and name to save the image.
imageType - Image format of the extracted image.
- Throws:
java.lang.Exception
- See Also:
getNextImage(OutputStream, ImageType)
getNextImage
public void getNextImage(java.io.OutputStream outputStream)
throws java.lang.Exception
- Saves image to stream with default image format - Jpeg.
- Parameters:
outputStream - The stream to save the image.
- Throws:
java.lang.Exception
- See Also:
getNextImage(OutputStream, ImageType)
getNextImage
public void getNextImage(java.io.OutputStream outputStream,
java.lang.String imageTypeName)
throws java.lang.Exception
- Saves image to stream with the given image format name.
Use this method before 2006/10/20. Aspose has upgraded to a new method getNextImage(OutputStream, ImageType).
Please use new method after that date.
[SampleCode]
//extract image with the given image format name (PNG)
PdfExtractor extractor = new PdfExtractor();
extractor.bindPdf(path + "Image.pdf");
extractor.extractImage();
String suffix = ".png";
int imageCount = 1;
while (extractor.hasNextImage()) {
extractor.getNextImage(path + imageCount + suffix, "PNG");
imageCount++;
}
- Parameters:
outputStream - The stream to save the image.
imageTypeName - Image format name of the extracted image.
- Throws:
java.lang.Exception
- See Also:
getNextImage(OutputStream, ImageType)
getNextImage
public void getNextImage(java.io.OutputStream outputStream,
ImageType imageTypeName)
throws java.lang.Exception
- Saves image to stream with the given image format.
[SampleCode]
//extract image with the given image format (PNG)
PdfExtractor extractor = new PdfExtractor();
extractor.bindPdf(path + "Image.pdf");
extractor.extractImage();
String suffix = ".png";
int imageCount = 1;
while (extractor.hasNextImage()) {
extractor.getNextImage(path + imageCount + suffix, ImageType.Png);
imageCount++;
}
- Parameters:
outputStream - The stream to save the image.
imageTypeName - Image format name of the extracted image.
- Throws:
java.lang.Exception
hasNextImage
public boolean hasNextImage()
- Determines whether more images can be retrieved.
Returns true if more images are available and false otherwise.
- Returns:
- true if more images can be retrieved; false otherwise.
extractText
public void extractText()
throws java.lang.Exception
- Extracts text from a Pdf document.
[SampleCode]
PdfExtractor extractor = new PdfExtractor();
extractor.bindPdf(path + "Text.pdf");
extractor.extractText();
extractor.getText(path + "text.txt");
- Throws:
java.lang.Exception
getText
public void getText(java.lang.String outputFile)
throws java.lang.Exception
- Saves text to file.
- Parameters:
outputFile - The file path and name to save the text.
- Throws:
java.lang.Exception
getText
public void getText(java.io.OutputStream outputStream)
throws java.lang.Exception
- Saves text to stream.
- Parameters:
outputStream - The stream to save the text.
- Throws:
java.lang.Exception
getWordCount
public int getWordCount()
- Returns the word count of the pdf document.
- Returns:
- the word count.
[SampleCode]
PdfExtractor extractor = new PdfExtractor();
extractor.bindPdf(path + "Text.pdf");
extractor.extractText();
int wordCount = extractor.getWordCount();
extractTextInRectangle
public void extractTextInRectangle(java.awt.Rectangle rec,
ExtractTextMode extMode)
throws PdfViewerException,
java.lang.Exception
- Extracts the text content of the page within the rectangle.
The rectangle coordinate origin is the top left corner in pdf files, and its coordinate is (0,0).
- Parameters:
rec - java.awt.Rectangle the rectangle from which the text is extracted.
The coordinate origin is (0,0) which is the pdf file top left point.
The rec.width is the extraction text width and the rec.height is the extraction text height.
extMode - ExtractTextMode the extract text mode.
- Throws:
PdfViewerException
java.lang.Exception
extractTextInRectangle
public void extractTextInRectangle(java.awt.Rectangle rec)
throws PdfViewerException,
java.lang.Exception
- Extracts the text content of the page within the rectangle.
The rectangle coordinate origin is the top left corner in pdf files, and its coordinate is (0,0).
Invoking this method to extract text uses the default ExtractTextMode.PureTextWithWidth mode.
- Parameters:
rec - java.awt.Rectangle the rectangle from which the text is extracted.
The coordinate origin is (0,0) which is the pdf file top left point.
The rec.width is the extraction text width and the rec.height is the extraction text height.
- Throws:
PdfViewerException
java.lang.Exception
getAllRectangleText
public void getAllRectangleText(java.lang.String outputFile)
throws java.lang.Exception
- Saves all texts within the rectangle to file.
- Parameters:
outputFile - The file path and name to save the texts.
- Throws:
java.lang.Exception
getAllRectangleText
public void getAllRectangleText(java.io.OutputStream outputStream)
throws java.lang.Exception
- Saves all texts within the rectangle to stream.
- Parameters:
outputStream - The stream to save the texts.
- Throws:
java.lang.Exception
extractAttachment
public void extractAttachment()
throws java.lang.Exception
- Extracts attachments from a Pdf document.
- Throws:
java.lang.Exception
getAttachment
public void getAttachment(java.lang.String outputPath)
throws java.io.IOException
- Saves all the attachment files to outputPath.
[SampleCode]
PdfExtractor extractor = new PdfExtractor();
extractor.bindPdf(path + "Attach.pdf");
extractor.extractAttachment();
extractor.getAttachment(path);
- Parameters:
outputPath - The path to save the attachment.
- Throws:
java.io.IOException
getAttachment
public java.io.ByteArrayOutputStream[] getAttachment()
throws java.io.IOException
- Saves all the attachment files to streams.
[SampleCode]
PdfExtractor extractor = new PdfExtractor();
extractor.bindPdf(path + "Attach.pdf");
extractor.extractAttachment();
ArrayList names = extractor.getAttachNames();
ByteArrayOutputStream[] tempStreams = extractor.getAttachment();
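for (int i = 0; i < tempStreams.length; i++) {
// NOTE: the remainder of this loop was lost in the original text (truncated at the '<' character);
// presumably each tempStreams[i] is saved under the matching name from names.get(i) - confirm against the original documentation.
}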
- Returns:
- The stream array of the attachment file in the pdf document.
- Throws:
java.io.IOException
getAttachNames
public java.util.ArrayList getAttachNames()
- Gets the file names of all the attachments.
- Returns:
- attachment's file names.
|