本文使用 Apache POI(配合 PDFBox)来提取 .doc、.docx 和 .pdf 文件中的文本内容:
导入依赖
导入POI主要的依赖:
<!-- 用于对文件操作,例如写操作 -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.15</version>
</dependency>
<!-- .docx解析依赖 -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.15</version>
</dependency>
<!-- .doc解析依赖 -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.15</version>
</dependency>
<!-- .pdf解析依赖 -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.4</version>
</dependency>
<!-- 单元测试依赖,也可以自己写一个测试类,这个无关紧要 -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
需要保持上面三个 POI 依赖(poi、poi-ooxml、poi-scratchpad)的版本一致,否则运行时容易出现找不到类或方法之类难以排查的错误;pdfbox 和 junit 的版本与 POI 无关,可以单独选择。
工具类
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
* @Author: 勇哥
* @Date: 2021/7/8 14:25
public class FileExtract {
public static String fileExtractText(String filePath) throws Exception {
String text;
if (filePath == null || "".equals(filePath)) {
return "";
if (filePath.toLowerCase().endsWith("doc")) {
InputStream file = new FileInputStream(new File(filePath));
WordExtractor wordExtractor = new WordExtractor(file);
text = wordExtractor.getText();
file.close();
wordExtractor.close();
} else if (filePath.toLowerCase().endsWith("docx")) {
OPCPackage opcPackage = POIXMLDocument.openPackage(filePath);
POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
text = extractor.getText();
opcPackage.close();
extractor.close();
} else if (filePath.toLowerCase().endsWith("pdf")) {
PDDocument pdDocument;
InputStream file = new FileInputStream(new File(filePath));
pdDocument = PDDocument.load(file);
PDFTextStripper stripper = new PDFTextStripper();
text = stripper.getText(pdDocument);
file.close();
pdDocument.close();
} else {
return "error file";
return text;
}
测试
public static void main(String[] args) throws XmlException, OpenXML4JException, IOException {
// String filePath = "F:\\extract\\20181226163749u1xcw.pdf";
String filePath = "F:\\extract\\12.docx";
try {
String s = FileExtract.fileExtractText(filePath);
System.out.println(s);
} catch (Exception e) {
e.printStackTrace();
}
结果分析:
通过扫描仪等工具扫描生成的 PDF 本质上是图片,不包含文本层,因此用上面的方法提取不出文字;这类文件需要先经过 OCR 识别。