JAVA实现PDF转HTML文档

本文的做法是：先将PDF文档的每一页渲染为PNG图片，再把各页图片纵向拼接成一张长图，然后将拼接后的图片转为base64字符串，最后嵌入html文档（也可以再写入html文件）。

引入maven依赖

<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox --><dependency><groupId>org.apache.pdfbox</groupId><artifactId>pdfbox</artifactId><version>2.0.12</version></dependency>

工具实现类

package cn.yueworld.pms.web.util;import cn.yueworld.framework.tools.exception.LogicException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import sun.misc.BASE64Encoder;

import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.Base64;
import java.net.URL;/*** PDF文档转HTML文档** @author zkg* @since 2024/6/11 16:45*/
public class PdfConvertHtmlUtil {/*** 日志对象*/private static Logger logger = LoggerFactory.getLogger(PdfConvertHtmlUtil.class);/*** pdf转图片** @param pdfUrl pdf路径* @return BufferedImage 图片*/public static BufferedImage pdfToImage(String pdfUrl) {PDDocument doc = null;try {// 本地文件// File file = new File("C:\\Users\\Dell\\Downloads\\测试.pdf");// inputStream = new FileInputStream(file);logger.info("解析pdf+pdfUrl：" + pdfUrl);URL url = new URL(pdfUrl);InputStream inputStream = url.openStream();doc = PDDocument.load(inputStream);PDFRenderer renderer = new PDFRenderer(doc);int pageCount = doc.getNumberOfPages();BufferedImage image = null;for (int i = 0; i < pageCount; i++) {if (image != null) {image = combineBufferedImages(image, renderer.renderImageWithDPI(i, 144));}if (i == 0) {image = renderer.renderImageWithDPI(i, 144); // Windows native DPI}// BufferedImage srcImage = resize(image, 240, 240);//产生缩略图}return combineBufferedImages(image);} catch (IOException e) {e.printStackTrace();} finally {try {if (doc != null) {doc.close();}} catch (IOException e) {e.printStackTrace();}}return null;}/*** BufferedImage拼接处理，添加分割线** @param images 文件* @return BufferedImage 添加分割线*/public static BufferedImage combineBufferedImages(BufferedImage... 
images) {int height = 0;int width = 0;for (BufferedImage image : images) {//height += Math.max(height, image.getHeight());height += image.getHeight();width = image.getWidth();}BufferedImage combo = new BufferedImage(width, height, BufferedImage.TYPE_INT_ARGB);Graphics2D g2 = combo.createGraphics();int x = 0;int y = 0;for (BufferedImage image : images) {//int y = (height - image.getHeight()) / 2;g2.setStroke(new BasicStroke(2.0f));// 线条粗细g2.setColor(new Color(193, 193, 193));// 线条颜色g2.drawLine(x, y, width, y);// 线条起点及终点位置g2.drawImage(image, x, y, null);//x += image.getWidth();y += image.getHeight();}return combo;}/*** 通过Base64创建HTML文件并输出html文件** @param base64  文件的base64* @param htmlUrl html保存路径*/public static void createHtmlByBase64(String base64, String htmlUrl) {StringBuilder stringHtml = new StringBuilder();PrintStream printStream = null;try {// 打开文件printStream = new PrintStream(new FileOutputStream(htmlUrl));} catch (FileNotFoundException e) {e.printStackTrace();}// 输入HTML文件内容stringHtml.append("<html><head>");stringHtml.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">");stringHtml.append("<title></title>");stringHtml.append("</head>");stringHtml.append("<body style=\"\r\n" + "    text-align: center;\r\n" + "    background-color: #C1C1C1;\r\n" + "\">");stringHtml.append("<img src=\"data:image/png;base64," + base64 + "\" />");stringHtml.append("<a name=\"head\" style=\"position:absolute;top:0px;\"></a>");//添加锚点用于返回首页stringHtml.append("<a style=\"position:fixed;bottom:10px;right:10px\" href=\"#head\">回到首页</a>");stringHtml.append("</body></html>");try {// 将HTML文件内容写入文件中printStream.println(stringHtml.toString());} catch (Exception e) {e.printStackTrace();} finally {if (printStream != null) {printStream.close();}}}/*** 图片转为base64编码** @param bufferedImage 图片* @return base64编码*/public static String bufferedImageToBase64(BufferedImage bufferedImage) {ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();String png_base64 
= "";try {ImageIO.write(bufferedImage, "png", byteArrayOutputStream);// 写入流中byte[] bytes = byteArrayOutputStream.toByteArray();// 转换成字节BASE64Encoder encoder = new BASE64Encoder();// 转换成base64串 删除 \r\npng_base64 = encoder.encodeBuffer(bytes).trim().replaceAll("\n", "").replaceAll("\r", "");} catch (IOException e) {e.printStackTrace();}return png_base64;}/*** 将图片的base64编码替换到html文件里面** @param htmlUrl 文件路径* @param base64  图片* @return html*/public static Document htmlReplaceTag(String htmlUrl, String base64) {try {// 读取文件// File file = new File(htmlUrl);// Document document = Jsoup.parse(file, "UTF-8");logger.info("解析html+htmlUrl" + htmlUrl);// 使用Jsoup.connect方法获取Document对象Document document = Jsoup.connect(htmlUrl).get();logger.info("替换元素内容开始:" + document);// 通过ID获取要替换的元素Element oldTag = document.getElementById("pdfFileId");if (oldTag != null) {// 替换元素内容if (base64 != null) {// 创建新标签Element newTag = document.createElement("img");// 设置图片的idnewTag.attr("id", "pdfFileId");// 设置图片的属性newTag.attr("src", "data:image/png;base64," + base64);// 替换旧标签oldTag.replaceWith(newTag);} else {oldTag.remove();}logger.info("替换元素内容结束:" + document);}// 保存修改后的HTML文件document.outputSettings().charset("UTF-8");document.outputSettings().prettyPrint(true);// 写到本地// FileWriter fileWriter = new FileWriter(new File("C:\\Users\\Dell\\Downloads\\测试.html"));// fileWriter.write(document.outerHtml());// fileWriter.close();return document;} catch (Exception e) {e.printStackTrace();logger.error("将图片的base64编码替换到html文件里面失败:" + e.getMessage());throw new LogicException("将图片的base64编码替换到html文件里面失败");}}/*** html + pdf** @param htmlUrl 文件路径* @param pdfUrl  图片* @return html*/public static Document htmlAddPdf(String htmlUrl, String pdfUrl) {try {String base64_png = null;if (pdfUrl != null) {// pdf转图片BufferedImage bufferedImage = pdfToImage(pdfUrl);// 图片转为base64编码base64_png = bufferedImageToBase64(bufferedImage);}// 将图片的base64编码替换到html文件里面return htmlReplaceTag(htmlUrl, base64_png);} catch (Exception e) 
{e.printStackTrace();logger.error("pdf文件转换失败:" + e.getMessage());throw new LogicException("pdf文件转换失败:" + e.getMessage());}}
}

测试Demo

public static void main(String[] args) {// PDF文件URLString pdfUrl = "https://geli-dev.yueworld.cn:8060/group1/M00/00/47/wKh5GWZfvn6AILBZAASNWOGlbkI479.pdf";// HTML文件路径String htmlUrl = "https://geli-dev.yueworld.cn:8060/cm_web/print/cm_Zlcontract001/#/cm_Zlcontract001?conId=2371";Document document = htmlAddPdf(htmlUrl, pdfUrl);System.out.println("结果：" + document);}

可以是在线的pdf、html，也可以是本地的pdf和html：工具类默认读取在线地址，读取本地文件的写法以注释的形式保留在工具类里面，两种方法都可以参考。

第三方库Jsoup

其中在线的html需要使用第三方库Jsoup，Jsoup是一个开源的Java库，它提供了简单和方便的API来处理HTML文档。

你可以使用以下代码添加Jsoup依赖项到你的项目中：

依赖

<dependency><groupId>org.jsoup</groupId><artifactId>jsoup</artifactId><version>1.13.1</version>
</dependency>

下面是一个使用Jsoup获取网页HTML数据的示例代码：

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;public class HtmlFetcher {public static void main(String[] args) {try {// 使用Jsoup.connect方法获取Document对象Document doc= Jsoup.connect(htmlUrl).get();String htmlData = doc.html();System.out.println(htmlData);} catch (Exception e) {e.printStackTrace();}}
}

JAVA实现PDF转HTML文档

引入maven依赖

工具实现类

测试Demo

第三方库Jsoup

依赖

相关资讯

热文排行

最新新闻

推荐新闻

热搜词