解析word文档段落以及表格

Word 文档的解析比较麻烦:为了兼容两种格式,需要分别处理 .docx 和 .doc 两种模板。

1. 获取段落内容:

/**
 * Reads the paragraph-level fields of a Word template and adds them to the table data
 * returned by {@code word44(filePath)}.
 *
 * <p>Files whose name ends with "docx" (case-insensitive) are read with XWPF; every other
 * file is read with HWPF. Three values are stored in the resulting {@code PageData}:
 * <ul>
 *   <li>{@code carryingPeople} - text after the first ":" of body element 3 (docx) or of the
 *       15th paragraph from the end (doc);</li>
 *   <li>{@code appraisalApplyTime} - text after the first ":" of body element 4 (docx) or of
 *       the 14th paragraph from the end (doc), with "年"/"月" replaced by "-" and "日"
 *       removed;</li>
 *   <li>{@code auditors} - whitespace-stripped text of body element 7 (docx) or of the last
 *       paragraph (doc).</li>
 * </ul>
 *
 * @param filePath path of the Word file to parse
 * @return the failed result of {@code word44} unchanged when table parsing fails; otherwise a
 *         result carrying the collected {@code PageData}, marked successful only when no
 *         exception occurred while reading the paragraphs
 * @throws Exception declared for interface compatibility; exceptions raised while parsing are
 *         caught here and reported through the result message
 */
public JsonResult readWordTemporaryExit(String filePath) throws Exception {
    JsonResult jsonResult = new JsonResult();
    PageData pd = new PageData();
    // try-with-resources closes the stream and the document on every exit path,
    // including the early returns below and any parsing exception.
    try (FileInputStream in = new FileInputStream(filePath)) {
        if (filePath.toLowerCase().endsWith("docx")) {
            try (XWPFDocument document = new XWPFDocument(in)) {
                List<IBodyElement> elements = document.getBodyElements();
                // Table data comes first; paragraph values are added on top of it.
                JsonResult tableResult = word44(filePath);
                if (!tableResult.isSuccess()) {
                    return tableResult;
                }
                pd = (PageData) tableResult.getObj();

                // The body-element indices are fixed by the template layout.
                String peopleLine = StringUtils.deleteWhitespace(
                        getParagraphText((XWPFParagraph) elements.get(3)));
                pd.put("carryingPeople", peopleLine.substring(peopleLine.indexOf(":") + 1));

                String dateLine = StringUtils.deleteWhitespace(
                        getParagraphText((XWPFParagraph) elements.get(4))).replace(" ", "");
                String applyTime = dateLine.substring(dateLine.indexOf(":") + 1)
                        .replace("年", "-")
                        .replace("月", "-")
                        .replace("日", "");
                pd.put("appraisalApplyTime", applyTime);

                String auditors = StringUtils.deleteWhitespace(
                        getParagraphText((XWPFParagraph) elements.get(7))).replace(" ", "");
                pd.put("auditors", auditors);
            }
        } else {
            try (HWPFDocument document = new HWPFDocument(in)) {
                // Table data comes first; paragraph values are added on top of it.
                JsonResult tableResult = word44(filePath);
                if (!tableResult.isSuccess()) {
                    return tableResult;
                }
                pd = (PageData) tableResult.getObj();

                // Paragraphs are addressed from the end of the document range.
                Range range = document.getRange();
                int paragraphCount = range.numParagraphs();

                String peopleLine = StringUtils.deleteWhitespace(
                        range.getParagraph(paragraphCount - 15).text());
                pd.put("carryingPeople", peopleLine.substring(peopleLine.indexOf(":") + 1));

                String dateLine = StringUtils.deleteWhitespace(
                        range.getParagraph(paragraphCount - 14).text());
                String applyTime = dateLine.substring(dateLine.indexOf(":") + 1)
                        .replace("年", "-")
                        .replace("月", "-")
                        .replace("日", "");
                pd.put("appraisalApplyTime", applyTime);

                String auditors = StringUtils.deleteWhitespace(
                        range.getParagraph(paragraphCount - 1).text());
                pd.put("auditors", auditors);
            }
        }
        jsonResult.setMsg("解析成功");
        jsonResult.setSuccess(true);
    } catch (Exception e) {
        // Any failure (missing file, unexpected layout, bad index) is reported as a format error.
        e.printStackTrace();
        jsonResult.setMsg("word格式异常,请检查word格式有无数据缺失,无效数据行!");
    }
    System.out.println("----------------------");
    System.out.println(pd);
    jsonResult.setObj(pd);
    return jsonResult;
}

2. 获取第二个表格的信息:

    public JsonResult word44(String filePath) throws Exception {
JsonResult jsonResult = new JsonResult();
PageData pd = new PageData();
try {
FileInputStream in = new FileInputStream(filePath);
if(filePath.toLowerCase().endsWith("docx")){
//word 2007 图片不会被读取, 表格中的数据会被放在字符串的最后
XWPFDocument xwpf = new XWPFDocument(in);//得到word文档的信息
Iterator<XWPFTable> it = xwpf.getTablesIterator();//得到word中的表格
// 设置需要读取的表格 set是设置需要读取的第几个表格,total是文件中表格的总数
int set = 1, total = 2;
int num = set;
// 过滤前面不需要的表格
for (int i = 0; i < set - 1; i++) {
it.hasNext();
it.next();
}
//获取表格
JsonResult word33 = word33(filePath);
if(!word33.isSuccess()){
return word33;
}
pd = (PageData) word33.getObj();

while(it.hasNext()) {
XWPFTable table = it.next();
System.out.println("这是第" + num + "个表的数据");
List<XWPFTableRow> rows = table.getRows();

XWPFTableRow row = rows.get(0);
List<XWPFTableCell> cells = row.getTableCells();
XWPFTableCell cell = cells.get(1);
if(StringUtil.isEmpty(cell.getText())){
jsonResult.setMsg("XXX不能为空");
return jsonResult;
}else {
pd.put("XXXX",cell.getText());
}
cell = cells.get(3);
String a = cell.getText();
if(StringUtil.isEmpty(cell.getText())){
jsonResult.setMsg("X不能为空");
return jsonResult;
}else {
String b = a.replaceAll("年", "-");
String c = b.replaceAll("月", "-");
String d = c.replaceAll("日", "");
pd.put("X",d);
}
}else{
//获取表格和图片
JsonResult word33 = word33(filePath);
if(!word33.isSuccess()){
return word33;
}
pd = (PageData) word33.getObj();
// 获取word中的所有段落与表格
// 处理doc格式 即office2003版本
POIFSFileSystem pfs = new POIFSFileSystem(in);
HWPFDocument hwpf = new HWPFDocument(pfs);
Range range = hwpf.getRange();//得到文档的读取范围
TableIterator it = new TableIterator(range);
// 迭代文档中的表格
// 如果有多个表格只读取需要的一个 set是设置需要读取的第几个表格,total是文件中表格的总数
int set = 2, total = 2;
int num = set;
for (int i = 0; i < set-1; i++) {
it.hasNext();
it.next();
}
while (it.hasNext()) {
Table tb = (Table) it.next();
System.out.println("这是第" + num + "个表的数据");
//申报单位
TableRow tr = tb.getRow(0);//取得行
TableCell td = tr.getCell(1);//取得单元格
String XXX= getParagraph(td);
if (StringUtil.isEmpty(XXX)) {
jsonResult.setMsg("XXX不能为空");
return jsonResult;
}
pd.put("XXX", XXX);
//审核时间
td = tr.getCell(3);//取得单元格
String X= getParagraph(td);
if (StringUtil.isEmpty(X)) {
jsonResult.setMsg("X不能为空");
return jsonResult;
}
String b = X.replaceAll("年", "-");
String c = b.replaceAll("月", "-");
String d = c.replaceAll("日", "");
pd.put("X", d);
}
in.close();
}
in.close();
jsonResult.setMsg("解析成功");
jsonResult.setSuccess(true);
} catch(Exception e){
e.printStackTrace();
jsonResult.setMsg("word格式异常,请检查word格式有无数据缺失,无效数据行!");
}
System.out.println("----------------------");
System.out.println(pd);
jsonResult.setObj(pd);
return jsonResult;
}

依此类推,Word 文档中有几个表格,就写几个对应的解析方法。


解析word文档段落以及表格
https://tdsgpo.top/2022/04/02/解析word文档段落以及表格/
作者
DDS
发布于
2022年4月2日
许可协议