# Install: pip install pdfplumber
import pdfplumber

# Extract text with pdfplumber
with pdfplumber.open('基于python的网页爬虫.pdf') as pdf:
    # Only the first page of the document is inspected in this demo.
    first_page = pdf.pages[0]
    # Print whatever text pdfplumber extracts from that page.
    print(first_page.extract_text())

# Extract a single table with pdfplumber
with pdfplumber.open('基于python的网页爬虫.pdf') as pdf:
    # Only the first page of the document is inspected in this demo.
    first_page = pdf.pages[0]
    # extract_table() yields one table for the page (per the pdfplumber docs,
    # the largest one, or None when no table is detected).
    print(first_page.extract_table())

# Extract multiple tables with pdfplumber
with pdfplumber.open('基于python的网页爬虫.pdf') as pdf:
    # Only the first page of the document is inspected in this demo.
    first_page = pdf.pages[0]
    # extract_tables() returns every table found on the page; print each one.
    for table in first_page.extract_tables():
        print(table)

# Extract a single financial-report table with pdfplumber.
# table_settings: the settings applied when extracting the table.
with pdfplumber.open('基于python的网页爬虫.pdf') as pdf:
    # Only the first page of the document is inspected in this demo.
    first_page = pdf.pages[0]
    # Use extract_table (singular): the loop below walks the rows of ONE
    # table. extract_tables would hand back a list of tables and each "row"
    # would then be an entire table. Fall back to an empty list because
    # extract_table returns None when no table is detected.
    table = first_page.extract_table(
        table_settings={
            'vertical_strategy': 'text',
            'horizontal_strategy': 'text',
        }
    ) or []

    new_table = []
    for row in table:
        # Skip empty rows: a row counts as empty when every cell is '' or None.
        if not any(row):
            continue
        # Merge the words: join the first three cells into a single cell
        # (presumably the text-based strategy splits the leading label across
        # several columns -- TODO confirm against the actual PDF), treating
        # missing cells as empty strings.
        merged_label = ''.join(str(cell) if cell else '' for cell in row[:3])
        # Keep the remaining cells unchanged after the merged label.
        new_table.append([merged_label, *row[3:]])

    print(new_table)