- 先pdf转图片
import os
from pdf2image import convert_from_path
# Step 1: render every page of the PDF to a PNG file named
# "<output_name>_<page number>.png" (1-based) inside output_folder.
pdf_path = '/Users/xxx/2022.pdf'
output_folder = './output_images2022'
output_name = 'page'

# exist_ok=True avoids the check-then-create race of a separate
# os.path.exists() test and is a no-op when the folder is already there.
os.makedirs(output_folder, exist_ok=True)

# NOTE: convert_from_path returns all pages at once; at 300 dpi this
# can be memory-hungry for long documents.
images = convert_from_path(pdf_path, dpi=300)
for page_number, image in enumerate(images, start=1):
    image.save(f'{output_folder}/{output_name}_{page_number}.png', 'PNG')
- OCR
from PIL import ImageEnhance
import pytesseract
from PIL import Image
from openpyxl import Workbook
def enhance_image(img):
    """Return a grayscale, contrast-boosted copy of *img* for OCR.

    Args:
        img: A PIL image.

    Returns:
        A new PIL image converted to mode ``'L'`` whose contrast has been
        enhanced by a factor of 2.0.
    """
    img = img.convert('L')
    img = ImageEnhance.Contrast(img).enhance(2.0)
    return img


def allimngs(image_path):
    """Run OCR on the image at *image_path* and return the recognized text.

    The image is pre-processed with :func:`enhance_image` and recognized
    with Tesseract's ``chi_sim`` language data.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognized text with all ASCII spaces removed (newlines are
        kept).
    """
    # The context manager closes the underlying file handle as soon as
    # OCR has finished instead of leaving it to the garbage collector.
    with Image.open(image_path) as image:
        text = pytesseract.image_to_string(
            enhance_image(image), lang="chi_sim"
        )
    return text.replace(' ', '')


class TrieNode:
    """A single node of the keyword trie."""

    def __init__(self):
        # Maps the next character to the child node reached through it.
        self.children = {}
        # Keywords that end exactly at this node (empty for inner nodes).
        self.keywords = []


class Trie:
    """Character trie used to look up keywords while scanning a text."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, keyword):
        """Add *keyword* to the trie, creating nodes as needed."""
        node = self.root
        for char in keyword:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.keywords.append(keyword)


def count_keywords(text, keywords):
    """Count non-overlapping keyword occurrences in *text*.

    The text is scanned left to right. At each position the longest
    keyword starting there is counted once and the scan resumes right
    after it; if no keyword starts at the position, the scan advances by
    one character.

    Args:
        text: The string to search.
        keywords: Iterable of keyword strings; duplicates are collapsed.
            An empty-string keyword is never matched and stays at 0.

    Returns:
        A dict mapping every distinct keyword to its occurrence count.
    """
    keywords = list(set(keywords))
    trie = Trie()
    for kw in keywords:
        trie.insert(kw)

    counters = {kw: 0 for kw in keywords}
    i = 0
    n = len(text)
    while i < n:
        current_node = trie.root
        # Node at which the longest keyword found so far ends. This must
        # be tracked separately: the walk may continue past it along the
        # prefix of a longer keyword that ultimately does not match, and
        # that deeper node carries no keywords to count.
        matched_node = None
        end_pos = i
        for j in range(i, n):
            next_node = current_node.children.get(text[j])
            if next_node is None:
                break
            current_node = next_node
            if current_node.keywords:
                matched_node = current_node
                end_pos = j + 1

        if matched_node is not None:
            for kw in matched_node.keywords:
                counters[kw] += 1
            i = end_pos
        else:
            i += 1
    return counters


def main():
    """OCR pages 1-108, count the keywords and write them to 2022.xlsx."""
    keywords = ['矮小', '安于现状', '暗藏', '暗淡', '暗黑']
    workbook = Workbook()
    sheet = workbook.active

    # Collect the per-page text and join once instead of growing a string
    # with repeated concatenation.
    page_texts = []
    for i in range(1, 109):
        image_path = f"/Users/xxx/output_images2022/page_{i}.png"
        page_texts.append(allimngs(image_path))
    # Drop spaces and line breaks so keywords split across OCR lines
    # can still be matched.
    all_text = ''.join(page_texts).replace(' ', '').replace('\n', '')

    result = count_keywords(all_text, keywords)
    # One row per keyword: column A holds the keyword, column B its count.
    for num, (k, v) in enumerate(result.items(), start=1):
        sheet[f'A{num}'] = k
        sheet[f'B{num}'] = v
        print(k, v, num)
    workbook.save(filename='2022.xlsx')


if __name__ == "__main__":
    main()