概念介绍
pending
table的接口
- doc.add_table() 方法在文档中添加表格对象;
- Table类路径:docx.table.Table(tbl: CT_Tbl, parent: t.ProvidesStoryPart)
是一个代理类,代理 WordprocessingML <w:tbl> 元素,对应docx.oxml.table.CT_Tbl对象;
- add_column(width: Length),在最右侧添加一列,返回_Column对象
- add_row(),在最下边添加一行,返回_Row 对象
- alignment,是docx.enum.table.WD_TABLE_ALIGNMENT的枚举值,表示表格在页面中居左、居中、居右;None时从样式层级中继承;
- autofit,单元格宽度是否自动适应内容;bool类型;
- cell(row_idx: int, col_idx: int) → docx.table._Cell 获取指定位置的单元格,(0, 0) 表格左上角的单元格;
- column_cells(column_idx: int) → list,获取一列的单元格;
- columns,返回所有列,可迭代;
- 获取指定行的单元格:table.rows[row_idx].cells ;
- rows 获取所有行序列;
- style,_TableStyle对象,表格样式,未设置样式时返回默认样式;
- table_direction,表格的方向,是docx.enum.table.WD_TABLE_DIRECTION的枚举值,LTR或RTL;
单元格的接口
-
类路径 docx.table._Cell(tc: CT_Tc, parent: TableParent),是一个代理类,代理docx.oxml.table.CT_Tc
-
add_paragraph(text: str = '', style: str | ParagraphStyle | None = None),在单元格内容的末尾添加段落,内部的文本样式会被表格样式影响;文本可以包含 \t 制表符,\n 或 \r 表示换行;
-
add_table(rows: int, cols: int) → docx.table.Table,在单元格内容末尾添加一个表格,返回表格对象,然后在表格后面添加一个空段落(word中的表格单元格必须以一个段落结束,且至少包含一个块级元素);
-
grid_span,单元格的水平跨度,若为两个单元格合并,则跨度为2;
-
iter_inner_content() → Iterator[Paragraph | Table],按照文档顺序获取单元格内部的内容;
-
merge(other_cell: docx.table._Cell) 按照矩形区域合并单元格;
-
paragraphs,列出单元格内的所有段落;
-
tables,按照出现顺序列出单元格内的所有表格
-
text,单元格内所有文本字符串;赋值时会替换所有当前现存内容;
-
vertical_alignment,WD_CELL_VERTICAL_ALIGNMENT 枚举值;
-
width,单元格宽度;
CT_Tc 接口
- 基于底层xml的单元格类,docx.oxml.table.CT_Tc
- p_lst: list[CT_P] 获取单元格的所有段落;
- tbl_lst: list[CT_Tbl] 获取单元格的所有表格;
- tcPr: CT_TcPr | None 获取单元格的属性;
- p = OneOrMore("w:p")
- tbl = OneOrMore("w:tbl")
- clear_content,清空单元格内容,保留tcPr属性;
行对象接口
-
类路径docx.table._Row(tr: CT_Row, parent: TableParent) 代理类,代理docx.oxml.table.CT_Row对象;
-
cells,当前行的所有单元格序列;
- cell 可以从第一列开始,也可以从第二列、第n列开始;
- cell 可以到最后一列都有,也可以到最后一列之前的某列结束;
- 只有真实存在的单元格 才会在单元格序列中;
- 同一个表不同的行中,单元格的数量可能不同;
-
grid_cols_after,当前行最后一个单元格后面未使用的布局网格数;
- 每行的单元格可以滞后开始(非第一列开始),提前结束(非最后一列结束),但是行的中间不能有单元格缺失;
-
grid_cols_before,当前行的开头处未使用的布局网格数,跳过这些布局网格,直接去渲染当前行的第一个单元格;
-
height,行的高度;
-
height_rule,高度规则,枚举值WD_ROW_HEIGHT_RULE;
-
table,当前行所属的表格对象;
列对象接口
-
类 docx.table._Column(gridCol: CT_TblGridCol, parent: TableParent) 同样是代理类;
-
cells,一列中所有的单元格 序列;
-
table,该列所属表格对象;
-
width, 列的宽度;
解析表格数据案例
- 创建一个word文档,并添加如下表格
- 解析该表格数据
from typing import Union, List, Tuple, Dict

from docx import Document
from docx import types
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.oxml.table import CT_Tc, CT_Row, CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.oxml.text.run import CT_R
from docx.table import Table
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from pydantic import BaseModel, Field
class MyCell(BaseModel):
    """Parsed content of a single table cell, grouped by content kind."""

    # MyText objects, one per non-empty paragraph (appended by parse_table_data).
    paras: list
    # Results of parsing tables nested inside the cell.
    tables: list
    # MyPicture placeholders collected from the cell's paragraphs.
    pics: list
    # Initialised to an empty list by parse_table_data; nothing visible fills it.
    graphics: list
class MyText(BaseModel):
    """Accumulated text of one paragraph."""

    # Concatenation of the run texts found in the paragraph.
    text: str

    def __str__(self):
        """Return the plain paragraph text."""
        return self.text
class MyRun(BaseModel):
    """Text content extracted from one run element."""

    # Text taken from the run's <w:t> child (set in parse_run).
    cnt: str
class MyPicture(BaseModel):
    """Placeholder created by parse_run when a run contains a <w:drawing>."""

    # Constant discriminator written into the serialised output.
    type: str = "pic"
class MyGraphic(BaseModel):
    """Placeholder for a graphic; not instantiated by the code visible here."""

    # Constant discriminator written into the serialised output.
    type: str = "graphic"
class TableData(BaseModel):
    """Parsed content of one table."""

    # RowData objects in row order (appended by parse_table_data).
    tbl_data: list
class RowData(BaseModel):
    """Parsed content of one table row."""

    # MyCell objects in the order returned by the row's `cells` sequence.
    cur_row: list
def parse_run(run: CT_R):
    """Extract the text and an optional picture marker from one run element.

    Args:
        run: The low-level ``<w:r>`` element; only its direct children are
            inspected.

    Returns:
        A ``(MyRun, MyPicture | None)`` tuple. ``MyRun.cnt`` holds the text of
        all ``<w:t>`` children concatenated in document order; the second item
        is a ``MyPicture`` when the run contains a ``<w:drawing>``, else None.
    """
    my_run = MyRun(cnt="")
    my_pic = None
    for child in run:
        # Strip the "{namespace}" prefix to get the local tag name.
        tag_name = child.tag.split("}")[-1]
        if tag_name == "rPr":
            print("节段属性...")
        elif tag_name == "t":  # <w:t>: text inside the run
            print("节段内部文本...")
            # A run may hold several <w:t> children, so accumulate instead of
            # overwriting; an empty <w:t/> has text None.
            my_run.cnt += child.text or ""
        elif tag_name == "drawing":  # <w:drawing>: picture inside the run
            print("节段内图片...")
            my_pic = MyPicture()
    return my_run, my_pic
def parse_paragraph(para: CT_P):
    """Collect the text and picture markers of one paragraph element.

    Args:
        para: The low-level ``<w:p>`` element; only its direct children are
            inspected.

    Returns:
        A ``(MyText, list)`` tuple: the concatenated text of the paragraph's
        direct runs, and the ``MyPicture`` markers found in those runs.
    """
    pictures = []
    paragraph_text = MyText(text="")
    for child in para:
        local_name = child.tag.split("}")[-1]
        if local_name == "ins":
            # Tracked insertions are only reported, not descended into.
            print("ins....")
            continue
        if local_name != "r":
            continue
        run_part, picture_part = parse_run(child)
        if isinstance(run_part, MyRun) and run_part.cnt:
            paragraph_text.text = paragraph_text.text + run_part.cnt
        if isinstance(picture_part, MyPicture):
            pictures.append(picture_part)
    return paragraph_text, pictures
def parse_table_data(table: CT_Tbl, parent: types.ProvidesStoryPart):
    """Recursively parse a table element into a ``TableData`` tree.

    Args:
        table: The low-level ``<w:tbl>`` element to parse.
        parent: The object handed to the ``Table`` proxy as its parent.

    Returns:
        A ``TableData`` whose ``tbl_data`` holds one ``RowData`` per row; each
        ``RowData.cur_row`` holds one ``MyCell`` per entry of ``row.cells``.
        Nested tables are parsed recursively into ``MyCell.tables``.
    """
    # Wrap the raw element in the Table proxy so rows/cells can be iterated.
    proxy_table = Table(table, parent)
    tbl_data = TableData(tbl_data=[])
    for row in proxy_table.rows:
        row_data = RowData(cur_row=[])
        # NOTE(review): python-docx appears to report a merged cell once per
        # grid column it spans, so such cells may repeat here; confirm whether
        # de-duplication is wanted.
        for cell in row.cells:
            cell_obj = MyCell(paras=[], pics=[], graphics=[], tables=[])
            for cnt in cell.iter_inner_content():
                if isinstance(cnt, Paragraph):
                    para_text, para_pic_list = parse_paragraph(cnt._element)
                    if para_text.text:  # skip empty paragraphs
                        cell_obj.paras.append(para_text)
                    cell_obj.pics.extend(para_pic_list)
                elif isinstance(cnt, Table):
                    # Pass the _Cell proxy (not the raw CT_Tc element) as the
                    # parent so the nested Table proxy can still resolve
                    # `.part` through its parent chain.
                    cell_obj.tables.append(parse_table_data(cnt._element, cell))
            row_data.cur_row.append(cell_obj)
        tbl_data.tbl_data.append(row_data)
    return tbl_data
if __name__ == '__main__':
    import json

    word_path = r"C:\Users\lenovo\Desktop\cc\lauf_chapter_old.docx"
    doc = Document(word_path)

    # Walk the body's direct children and parse each top-level table.
    # Only the last table parsed is kept and written out.
    data = None
    for ele in doc.element.body:
        if isinstance(ele, CT_Tbl):
            data = parse_table_data(ele, doc.element.body)

    if data is None:
        # Without this guard an absent table left `data` unbound.
        print(f"No table found in {word_path}")
    else:
        with open(word_path.replace(".docx", ".json"), "w", encoding="utf-8") as f:
            f.write(json.dumps(data.dict(), indent=4, ensure_ascii=False))
获取表格的页码
- 起始位置页码
- 结束位置页码
import pythoncom
from win32com.client import Dispatch, GetActiveObject
def get_word_instance():
    """Return a Word COM application object.

    Attaches to an already-running Word instance when one is available;
    otherwise launches a new one with its window hidden and alerts disabled.

    Returns:
        The ``Word.Application`` COM object.

    Raises:
        pythoncom.com_error: If Word can neither be attached to nor launched.
    """
    pythoncom.CoInitialize()
    try:
        # Prefer a Word instance that is already running.
        word_app = GetActiveObject("Word.Application")
    except pythoncom.com_error:
        # None running: start Word ourselves and keep it out of sight.
        word_app = Dispatch("Word.Application")
        word_app.Visible = False
        word_app.DisplayAlerts = False
    # Return outside any `finally`: returning from `finally` would discard a
    # launch failure and surface it as an unrelated UnboundLocalError.
    return word_app
def close_word_instance(word_app):
    """Quit Word if ``word_app`` refers to the currently active Word instance.

    COM errors (for example, no active Word instance) are ignored, keeping the
    call best-effort.

    Args:
        word_app: The ``Word.Application`` COM object to shut down.
    """
    try:
        word = GetActiveObject("Word.Application")
        # Compare with `==`, not `is`: every GetActiveObject/Dispatch call
        # returns a fresh Python wrapper, so identity is never true, while the
        # wrappers' equality compares the underlying COM object.
        # NOTE(review): this also quits a Word session the user had open
        # before the script attached to it; confirm that is acceptable.
        if word_app == word:
            word_app.Quit()
    except pythoncom.com_error:
        pass
# 基于pywin32 获取表格的起始页码、结束页码
def get_table_page_num(doc_path):
    """Return the start and end page number of every table in a Word document.

    Uses Word automation (pywin32), so page numbers come from Word's own
    layout.

    Args:
        doc_path: Path of the document to open in Word.

    Returns:
        A list with one ``(start_page, end_page)`` tuple per table, in the
        order of ``doc.Tables``.
    """
    # WdInformation.wdActiveEndAdjustedPageNumber
    wd_adjusted_page_number = 1

    word = get_word_instance()
    try:
        doc = word.Documents.Open(doc_path)
        try:
            page_numbers = []
            for table in doc.Tables:
                # Pages of the first and last paragraph inside the table.
                paragraphs = table.Range.Paragraphs
                start_page_num = paragraphs.First.Range.Information(
                    wd_adjusted_page_number
                )
                end_page_num = paragraphs.Last.Range.Information(
                    wd_adjusted_page_number
                )
                page_numbers.append((start_page_num, end_page_num))
        finally:
            # Close without saving, even when reading a table failed.
            doc.Close(False)
    finally:
        close_word_instance(word)
    return page_numbers
获取表格中每个Cell的页码
# 获取每个cell的页码
def get_tbl_cell_page_num(doc_path):
    """Return the page number of every cell paragraph, one flat list per table.

    The table's paragraphs are walked in order. The code assumes each row
    contributes ``column_count`` cell paragraphs followed by one extra
    paragraph that is skipped (presumably the end-of-row mark; cells holding
    several paragraphs would break this assumption).

    Args:
        doc_path: Path of the document to open in Word.

    Returns:
        A list with one entry per table in ``doc.Tables``; each entry is the
        flat, row-major list of page numbers collected for that table.
    """
    # WdInformation.wdActiveEndAdjustedPageNumber
    wd_adjusted_page_number = 1

    word = get_word_instance()
    try:
        doc = word.Documents.Open(doc_path)
        try:
            page_nums = []
            for table in doc.Tables:
                row_count = table.Range.Rows.Count
                column_count = table.Range.Columns.Count
                paras_per_row = column_count + 1
                cur_table_page_num = []
                for para_idx, para in enumerate(table.Range.Paragraphs, start=1):
                    if para_idx % paras_per_row != 0:
                        cur_table_page_num.append(
                            para.Range.Information(wd_adjusted_page_number)
                        )
                    elif para_idx >= row_count * paras_per_row:
                        # Reached the paragraph that closes the last row.
                        break
                # Always record the table, even when the closing paragraph was
                # never reached, so indices stay aligned with doc.Tables.
                page_nums.append(cur_table_page_num)
        finally:
            # Close without saving; previously the document was left open.
            doc.Close(False)
    finally:
        close_word_instance(word)
    return page_nums
def parse_tbl_cell(doc_path):
    """Parse every cell's text with python-docx and attach its page number.

    Page numbers come from ``get_tbl_cell_page_num`` and are matched to cells
    by flat row-major index (``row_index * column_count + column_index``).

    Args:
        doc_path: Path of the ``.docx`` document.

    Returns:
        A list of tables; each table is a list of rows; each row is a list of
        ``{"cnt": <cell text>, "page_num": <page or None>}`` dicts.
        ``page_num`` is None when no page number is available for that cell.
    """
    # Local import keeps this snippet runnable on its own.
    from docx import Document

    page_nums = get_tbl_cell_page_num(doc_path)
    doc = Document(doc_path)
    table_data = []
    for tidx, table in enumerate(doc.tables):
        table_pages = page_nums[tidx] if tidx < len(page_nums) else []
        col_num = len(table.columns)
        cur_table = []
        for ridx, row in enumerate(table.rows):
            cur_row = []
            for c_idx, cell in enumerate(row.cells):
                flat_idx = ridx * col_num + c_idx
                # The Word-side list can be shorter than the python-docx grid;
                # fall back to None instead of raising IndexError.
                page_num = (
                    table_pages[flat_idx] if flat_idx < len(table_pages) else None
                )
                cur_row.append({"cnt": cell.text, "page_num": page_num})
            cur_table.append(cur_row)
        table_data.append(cur_table)
    return table_data
if __name__ == '__main__':
    import json

    # Parse every table cell together with its page number, then dump the
    # result next to the source document as JSON.
    word_path = r"C:\Users\lenovo\Desktop\cc\lauf_chapter_old.docx"
    parsed_cells = parse_tbl_cell(word_path)
    json_path = word_path.replace(".docx", ".json")
    with open(json_path, "w", encoding="utf-8") as out_file:
        serialized = json.dumps(parsed_cells, indent=4, ensure_ascii=False)
        out_file.write(serialized)
解析表格数据如下: