HWPFDocument读取doc,wps文档（含图片读取）

相关文章推荐

怕考试的木耳 · 软件分享库合集链接汇总推荐_蓝奏云软件分享链 ...· 4 月前 ·

精明的日记本 · 江西省发布第三批非法集资严重失信人名单 ...· 1 年前 ·

奔放的梨子 · Creating Word ...· 1 年前 ·

奔跑的苦咖啡 · 地藏菩萨本愿经讲记（第十三卷）· 2 年前 ·

愤怒的菠萝 · 异兽魔都(林田球创作的系列漫画)_搜狗百科· 2 年前 ·

10 import org.apache.poi.hwpf.HWPFDocument; 11 import org.apache.poi.hwpf.model.PicturesTable; 12 import org.apache.poi.hwpf.usermodel.CharacterRun; 13 import org.apache.poi.hwpf.usermodel.Picture; 14 import org.apache.poi.hwpf.usermodel.Range; 16 /** 17 * Provides access to the pictures both by offset, iteration over the 18 * un-claimed, and peeking forward 19 */ 20 public class PicturesSource {//这个类是poi官网找的 21 private PicturesTable picturesTable; 22 private Set<Picture> output = new HashSet<Picture> (); 23 private Map<Integer, Picture> lookup; 24 private List<Picture> nonU1based; 25 private List<Picture> all; 26 private int pn = 0 ; 28 public PicturesSource(HWPFDocument doc) { 29 picturesTable = doc.getPicturesTable(); 30 all = picturesTable.getAllPictures(); 32 // Build the Offset-Picture lookup map 33 lookup = new HashMap<Integer, Picture> (); 34 for (Picture p : all) { 35 lookup.put(p.getStartOffset(), p); 36 } 38 // Work out which Pictures aren't referenced by 39 // a \u0001 in the main text 40 // These are \u0008 escher floating ones, ones 41 // found outside the normal text, and who 42 // knows what else... 
43 nonU1based = new ArrayList<Picture> (); 44 nonU1based.addAll(all); 45 Range r = doc.getRange(); 46 for ( int i = 0; i < r.numCharacterRuns(); i++ ) { 47 CharacterRun cr = r.getCharacterRun(i); 48 if (picturesTable.hasPicture(cr)) { 49 Picture p = getFor(cr); 50 int at = nonU1based.indexOf(p); 51 nonU1based.set(at, null ); 52 } 53 } 54 } 56 private boolean hasPicture(CharacterRun cr) { 57 return picturesTable.hasPicture(cr); 58 } 60 private void recordOutput(Picture picture) { 61 output.add(picture); 62 } 64 private boolean hasOutput(Picture picture) { 65 return output.contains(picture); 66 } 68 private int pictureNumber(Picture picture) { 69 return all.indexOf(picture) + 1 ; 70 } 72 public Picture getFor(CharacterRun cr) { 73 return lookup.get(cr.getPicOffset()); 74 } 76 /** 77 * Return the next unclaimed one, used towards the end 78 */ 79 private Picture nextUnclaimed() { 80 Picture p = null ; 81 while (pn < nonU1based.size()) { 82 p = nonU1based.get(pn); 83 pn++ ; 84 if (p != null ) 85 return p; 86 } 87 return null ; 88 }

2、处理图片和段落文字

 1 package com.poi.test;
 3 import java.io.ByteArrayOutputStream;
 4 import java.io.File;
 5 import java.io.FileInputStream;
 7 import org.apache.poi.hwpf.HWPFDocument;
 8 import org.apache.poi.hwpf.model.PicturesTable;
 9 import org.apache.poi.hwpf.usermodel.CharacterRun;
10 import org.apache.poi.hwpf.usermodel.Paragraph;
11 import org.apache.poi.hwpf.usermodel.Picture;
12 import org.apache.poi.hwpf.usermodel.Range;
14 public class PoiForWord {
15     /**
16      * 使用HWPFDocument解析word文档
17      * wps按doc处理即可
18      */
19     public void parseDocByHWPFDocument(){
20         try(FileInputStream is = new FileInputStream(new File("c:\\a.wps"));HWPFDocument document = new HWPFDocument(is);){
21             ByteArrayOutputStream baos = new ByteArrayOutputStream();//字节流，用来存储图片
22             PicturesSource pictures = new PicturesSource(document);
23             PicturesTable pictureTable = document.getPicturesTable();
25             Range r = document.getRange();//区间
26             for(int i=0;i<r.numParagraphs();i++){
27                 Paragraph p = r.getParagraph(i);//段落
28                 int fontSize = p.getCharacterRun(0).getFontSize();//字号，字号和是否加粗可用来当做标题或者某一关键标识的判断
                   boolean isBold = p.getCharacterRun(0).isBold();//是否加粗
29                 String paragraphText = p.text();//段落文本
31                 //以下代码解析图片,这样获取的图片是在文档流中的，是和文本按顺序解析的，可以很好的解决图片定位问题
32                 for(int j=0;j<p.numCharacterRuns();j++){
33                     CharacterRun cr = p.getCharacterRun(j);//字符
34                     if(pictureTable.hasPicture(cr)){
35                         Picture picture = pictures.getFor(cr);
36                         //如果是在页面显示图片，可转换为base64编码的图片
37                         picture.writeImageContent(baos);//将图片写入字节流
38 //                        String base64Image = "<img src='data:image/png;base64,"+new BASE64Encoder().encode(baos.toByteArray())+"'/>";
39                     }
40                 }
41             }
42         }catch(Exception e){
43             e.printStackTrace();
44         }
45     }
3、处理表格
 5     @Test
 6     public void parseDocTableByHWPFDocument(){
 7         try(FileInputStream is = new FileInputStream(new File("d:\\b.doc"));HWPFDocument document = new HWPFDocument(is);){
 8             Range r = document.getRange();//区间
 9             for(int i=0;i<r.numParagraphs();i++){
10                 Paragraph p = r.getParagraph(i);//段落
11                 String text = p.text();
13                 if(text.indexOf("序号")!=-1){//解析表格需要从表格第一个单元格获取表格，另一种表格的方式是直接获取所有表格，但是无法判断表格在文档中的位置
14                     Table table = r.getTable(p);
16                     int numRows = table.numRows();//获取行数
18                     for(int j=0;j<numRows;j++){
19                         TableRow row = table.getRow(j);
20                         int numCells = row.numCells();//当前行列数
21                         for(int k=0;k<numCells;k++){
22                             TableCell cell = row.getCell(k);
23                             System.out.print(cell.text()+" @ ");
24                         }
25                         System.out.println();
26                     }
27                 }
28             }
29         }catch(Exception e){
30             e.printStackTrace();
31         }
 单元格文本中出现的字符"?"（应为单元格结束标记等不可打印的控制字符）可通过字符串替换或截取来去除
另一种解析方式是使用 WordExtractor：它只支持解析文本内容，无法获取字号、加粗等字体格式
1 WordExtractor extor = new WordExtractor(is);
2             String[] paragraphText = extor.getParagraphText();