目录
0 环境准备
1 环境依赖
2 安装环境依赖包
3 程序逻辑实现
3.1 导入依赖包
3.2 定义将十六进制颜色字符串转换成RGB
3.3 定义相关参数
3.4 定义PDF处理类
3.4.1 定义PDFOCREnhanced类
3.4.2 定义init方法
3.4.3 定义探测水印方法
3.4.4 定义删除水印方法
3.4.5 定义paddle获取pdf文字方法
3.4.6 定义识别pdf图像主流程方法
3.5 定义main方法
4 完整代码
0 环境准备
- 已安装miniconda环境
1 环境依赖
此篇文章基于以下文章更改
python识别扫描版PDF文件,获取扫描版PDF文件的文本内容-CSDN博客
2 安装环境依赖包
pip install PyMuPDF
pip install paddlepaddle
pip install paddleocr
pip install opencv-python scikit-image
pip install common dual tight data prox
3 程序逻辑实现
3.1 导入依赖包
import os
import time

import fitz
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR

from pdf_ocr_reader import PDFOCR
3.2 定义将十六进制颜色字符串转换成RGB
def hex_to_rgb(hex_color):
    """Convert a hexadecimal colour code into an ``(r, g, b)`` tuple.

    Args:
        hex_color: Colour written as six hex digits, optionally prefixed
            with ``#`` (for example ``"#f33c34"``).

    Returns:
        A tuple of three ints, one per colour channel.

    Raises:
        ValueError: If one of the two-character pairs is not valid hex.
    """
    # Drop the optional leading '#', then parse the three 2-digit pairs.
    hex_color = hex_color.lstrip('#')
    return tuple(int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
3.3 定义相关参数
# Configuration: colour of the watermark to remove.
WATERMARK_HEX = "#f33c34"
# Colour tolerance. A larger value suits colour pages, a smaller one suits
# grayscale pages.
TOLERANCE = 100
# Minimum share of matching pixels (in percent) for a page to be treated as
# watermarked.
MIN_WATERMARK_PERCENT = 1

# Per-channel RGB bounds around the watermark colour, clamped to 0..255.
target_rgb = hex_to_rgb(WATERMARK_HEX)
lower_bound = np.array([max(0, x - TOLERANCE) for x in target_rgb])
upper_bound = np.array([min(255, x + TOLERANCE) for x in target_rgb])

# Equivalent bounds for single-channel (grayscale) images.
target_gray = hex_to_gray(WATERMARK_HEX)
lower_bound_gray = max(0, target_gray - TOLERANCE)
upper_bound_gray = min(255, target_gray + TOLERANCE)
3.4 定义PDF处理类
3.4.1 定义PDFOCREnhanced类
此类继承自参考博客中的PDFOCR类
class PDFOCREnhanced(PDFOCR):
3.4.2 定义init方法
def __init__(self, file_path, output_txt):
    """Initialise the base processor and create the PaddleOCR engine.

    Args:
        file_path: Forwarded unchanged to ``PDFOCR.__init__``.
        output_txt: Forwarded unchanged to ``PDFOCR.__init__``.
    """
    super().__init__(file_path, output_txt)
    # One OCR engine per processor: angle classification on, ``lang="ch"``,
    # GPU inference and PaddleOCR logging enabled.
    self.ocr = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=True, show_log=True)
3.4.3 定义探测水印方法
def _auto_detect_water(self, img_array, img):
    """Locate watermark-coloured pixels and decide whether a page is watermarked.

    Args:
        img_array: Pixel data as a NumPy array; a 2-D array is handled as
            a single-channel image, anything else as a colour image whose
            first three channels are compared.
        img: Object exposing ``width`` and ``height``, used for the pixel
            count.

    Returns:
        A ``(has_watermark, mask)`` pair. ``mask`` is a boolean array that
        is True where a pixel lies inside the watermark colour range, and
        ``has_watermark`` is True when the percentage of such pixels is at
        least ``MIN_WATERMARK_PERCENT``.
    """
    if len(img_array.shape) == 2:
        # Grayscale: compare against the single-channel bounds.
        mask = (img_array >= lower_bound_gray) & (img_array <= upper_bound_gray)
    else:
        # Colour: every one of the first three channels must be in range.
        rgb = img_array[:, :, :3]
        mask = np.all((rgb >= lower_bound) & (rgb <= upper_bound), axis=-1)

    # Percentage of the page covered by watermark-coloured pixels.
    watermark_ratio = np.sum(mask) / (img.width * img.height) * 100
    return watermark_ratio >= MIN_WATERMARK_PERCENT, mask
3.4.4 定义删除水印方法
def _remove_watermark(self, mask, img_array):
    """Paint every masked pixel white and return the result as a PIL image.

    Args:
        mask: Boolean array with one entry per pixel; True marks a pixel
            to overwrite.
        img_array: Pixel data, either 2-D (single channel) or with a
            trailing channel axis.

    Returns:
        A ``PIL.Image.Image`` built from the cleaned array.
    """
    # Same shape and dtype as the input, filled with 255.
    white_bg = np.full_like(img_array, 255)

    if len(img_array.shape) == 2:
        output_array = np.where(mask, white_bg, img_array)
    else:
        # Add a trailing axis so the per-pixel mask broadcasts over channels.
        output_array = np.where(mask[..., None], white_bg, img_array)
    return Image.fromarray(output_array)
3.4.5 定义paddle获取pdf文字方法
def ocr_recognition(self, image):
    """Run OCR on one image and return the recognised text.

    Args:
        image: Image passed straight to ``self.ocr.ocr``.

    Returns:
        The recognised lines joined with newlines, or an empty string when
        nothing was recognised.
    """
    result = self.ocr.ocr(image, cls=True)
    # The engine can return an empty result, or a list whose first entry is
    # None, when no text is found on the page; treat both as "no lines"
    # instead of failing while iterating None.
    if not result or not result[0]:
        return ''
    # Each entry's second element holds the text in its first position.
    texts = [line[1][0] for line in result[0]]
    return '\n'.join(texts)
3.4.6 定义识别pdf图像主流程方法
def _ocr_images(self):
    """Strip watermarks from every page image, OCR it, and write the text.

    Iterates the files in ``self.temp_img_dir`` in sorted name order. Pages
    detected as watermarked are cleaned and saved back over the original
    image before recognition. Text is written to ``self.output_txt`` as
    UTF-8, one page after another.
    """
    with open(self.output_txt, 'w', encoding='utf-8') as f:
        for img_file in sorted(os.listdir(self.temp_img_dir)):
            img_path = os.path.join(self.temp_img_dir, img_file)

            # Read the pixels and release the file handle right away; the
            # size attributes of ``img`` stay usable after closing.
            with Image.open(img_path) as img:
                img_array = np.array(img)
            # Optional preprocessing (median-filter denoising, Otsu
            # thresholding) could be applied to the image at this point.

            has_watermark, mask = self._auto_detect_water(img_array, img)
            page_array = img_array
            if has_watermark:
                print(f"检测到水印: {img_file}")
                clean_img = self._remove_watermark(mask, img_array)
                clean_img.save(img_path)  # overwrite the original page image
                page_array = np.array(clean_img)

            text = self.ocr_recognition(page_array)
            f.write(text + '\n')
            print(f"已完成 {img_file} 识别")
3.5 定义main方法
if __name__ == '__main__':
    # A timestamp in the file name keeps the output of each run separate.
    cur_timestamp = time.time()
    output_path = f'./out/output-{cur_timestamp}.txt'
    # Make sure the output directory exists before the text file is opened.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    processor = PDFOCREnhanced('./in_pdf/Txxxx71.pdf', output_path)
    try:
        processor._pdf_to_images(zoom=4)
        processor._ocr_images()
    finally:
        # Always run the processor's cleanup, even if a step fails.
        processor._cleanup()
4 完整代码
import os
import time

import fitz
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR

from pdf_ocr_reader import PDFOCR


def hex_to_rgb(hex_color):
    """Convert a hexadecimal colour code into an ``(r, g, b)`` tuple."""
    hex_color = hex_color.lstrip('#')
    return tuple(int(hex_color[i:i + 2], 16) for i in (0, 2, 4))


def hex_to_gray(hex_color):
    """Convert a hexadecimal colour code into a single grayscale value.

    The channels are combined as ``0.299 * r + 0.587 * g + 0.114 * b``, so
    the result is a float.
    """
    hex_color = hex_color.lstrip('#')
    r, g, b = (int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
    return 0.299 * r + 0.587 * g + 0.114 * b


# Configuration: colour of the watermark to remove.
WATERMARK_HEX = "#f33c34"
# WATERMARK_HEX = "#000000"
# Colour tolerance. A larger value suits colour pages, a smaller one suits
# grayscale pages.
TOLERANCE = 100
DPI = 300  # image resolution
# Minimum share of matching pixels (in percent) for a page to be treated as
# watermarked.
MIN_WATERMARK_PERCENT = 1

# Per-channel RGB bounds around the watermark colour, clamped to 0..255.
target_rgb = hex_to_rgb(WATERMARK_HEX)
lower_bound = np.array([max(0, x - TOLERANCE) for x in target_rgb])
upper_bound = np.array([min(255, x + TOLERANCE) for x in target_rgb])

# Equivalent bounds for single-channel (grayscale) images.
target_gray = hex_to_gray(WATERMARK_HEX)
lower_bound_gray = max(0, target_gray - TOLERANCE)
upper_bound_gray = min(255, target_gray + TOLERANCE)


class PDFOCREnhanced(PDFOCR):
    """PDF OCR processor that removes a coloured watermark before recognition."""

    def __init__(self, file_path, output_txt):
        """Initialise the base processor and create the PaddleOCR engine."""
        super().__init__(file_path, output_txt)
        self.ocr = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=True, show_log=True)

    def _auto_detect_water(self, img_array, img):
        """Locate watermark-coloured pixels and decide whether a page is watermarked.

        Returns:
            A ``(has_watermark, mask)`` pair. ``mask`` is a boolean array
            that is True where a pixel lies inside the watermark colour
            range, and ``has_watermark`` is True when the percentage of
            such pixels is at least ``MIN_WATERMARK_PERCENT``.
        """
        if len(img_array.shape) == 2:
            # Grayscale: compare against the single-channel bounds.
            mask = (img_array >= lower_bound_gray) & (img_array <= upper_bound_gray)
        else:
            # Colour: every one of the first three channels must be in range.
            rgb = img_array[:, :, :3]
            mask = np.all((rgb >= lower_bound) & (rgb <= upper_bound), axis=-1)

        # Percentage of the page covered by watermark-coloured pixels.
        watermark_ratio = np.sum(mask) / (img.width * img.height) * 100
        return watermark_ratio >= MIN_WATERMARK_PERCENT, mask

    def _remove_watermark(self, mask, img_array):
        """Paint every masked pixel white and return the result as a PIL image."""
        white_bg = np.full_like(img_array, 255)
        if len(img_array.shape) == 2:
            output_array = np.where(mask, white_bg, img_array)
        else:
            # Add a trailing axis so the per-pixel mask broadcasts over channels.
            output_array = np.where(mask[..., None], white_bg, img_array)
        return Image.fromarray(output_array)

    def _pdf_to_images(self, zoom=3, page_numbers=None):
        """Render PDF pages to PNG files in ``self.temp_img_dir``.

        Args:
            zoom: Scale factor applied to both axes to raise the resolution.
            page_numbers: Optional iterable of zero-based page indices to
                render. ``None`` (the default) renders every page.
        """
        doc = fitz.open(self.file_path)
        try:
            if page_numbers is None:
                page_numbers = range(len(doc))
            mat = fitz.Matrix(zoom, zoom)
            for page_num in page_numbers:
                page = doc.load_page(page_num)
                pix = page.get_pixmap(matrix=mat, alpha=False)
                # File names use a 1-based, zero-padded page index so that
                # sorting by name keeps the pages in order.
                p_index = str(page_num + 1).zfill(5)
                img_path = os.path.join(self.temp_img_dir, f'page_{p_index}.png')
                pix.save(img_path)
                print(f"已完成 {img_path} 存储")
        finally:
            doc.close()

    def ocr_recognition(self, image):
        """Run OCR on one image and return the recognised lines joined by newlines."""
        result = self.ocr.ocr(image, cls=True)
        # The engine can return an empty result, or a list whose first entry
        # is None, when no text is found; treat both as "no lines".
        if not result or not result[0]:
            return ''
        return '\n'.join(line[1][0] for line in result[0])

    def _ocr_images(self):
        """Strip watermarks from every page image, OCR it, and write the text."""
        with open(self.output_txt, 'w', encoding='utf-8') as f:
            for img_file in sorted(os.listdir(self.temp_img_dir)):
                img_path = os.path.join(self.temp_img_dir, img_file)

                # Read the pixels and release the file handle right away;
                # the size attributes of ``img`` stay usable after closing.
                with Image.open(img_path) as img:
                    img_array = np.array(img)

                has_watermark, mask = self._auto_detect_water(img_array, img)
                page_array = img_array
                if has_watermark:
                    print(f"检测到水印: {img_file}")
                    clean_img = self._remove_watermark(mask, img_array)
                    clean_img.save(img_path)  # overwrite the original page image
                    page_array = np.array(clean_img)

                text = self.ocr_recognition(page_array)
                f.write(text + '\n')
                print(f"已完成 {img_file} 识别")


if __name__ == '__main__':
    # A timestamp in the file name keeps the output of each run separate.
    cur_timestamp = time.time()
    output_path = f'./out/output-{cur_timestamp}.txt'
    # Make sure the output directory exists before the text file is opened.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    processor = PDFOCREnhanced('./in_pdf/T2xxx71.pdf', output_path)
    try:
        processor._pdf_to_images(zoom=4)
        processor._ocr_images()
    finally:
        # Always run the processor's cleanup, even if a step fails.
        processor._cleanup()