JAVA读取(DOC、DOCX、PDF、PPT、PPTX)文件文本内容及图片

以下为瞎扯淡:

温馨提示:有很多方法均可以解析这些常见的文件,以下内容使用的是apache-poi + apache-pdfbox实现的。

        关于文档解析,在网上搜索了很久,无奈内容太过繁杂,找不到合适的代码,一大半都是只支持文本。没办法,只能自己在网上一点一点CV了,最终提取了这些代码,不能说好用吧,应该可解燃眉之急。关于doc文档以及pdf文档还是有很多问题的,后续希望大佬们能在帖子下面多多指正,能优化一下代码,那就更好了。

以下为正文内容:

首先把以下这些依赖干进去

        
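        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>4.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>4.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.22</version>
        </dependency>
        <!-- 注意:下面的代码还用到了 org.apache.poi.xwpf / xslf(由 poi-ooxml 提供,版本需与 poi 保持一致)、
             Apache HttpClient(org.apache.http)以及 dubbo 的 CollectionUtils,项目中若没有这些依赖也需要一并引入。 -->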
		
要测试的话,给你贴一个在线文档地址:http://api.idocv.com/data/doc/manual.docx(注意:这个在线文档里是没有图片的)
/**
 * Demo entry point: parses one local file and one remote file and prints
 * the content returned for each to standard output.
 *
 * @param args unused
 * @throws IOException if either document cannot be read or parsed
 */
public static void main(String[] args) throws IOException {
        // Directory handed to both calls as the image output location.
        final String imageDir = "E:\\临时图片";

        // Local file example.
        final String localResult = processDocumentFromFilePath("E:\\VPN系统使用手册.pptx", imageDir);
        System.out.println(localResult);

        // Remote file example.
        final String remoteResult = processDocumentFromUrl("http://api.idocv.com/data/doc/manual.docx", imageDir);
        System.out.println(remoteResult);
    }
然后上车:飕飕飕

JAVA读取(DOC、DOCX、PDF、PPT、PPTX)文件文本内容及图片

import com.alibaba.dubbo.common.utils.CollectionUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.usermodel.*;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.sl.usermodel.TextParagraph;
import org.apache.poi.xslf.usermodel.*;
import org.apache.poi.xwpf.usermodel.*;

import java.io.*;
import java.util.Date;
import java.util.List;
import java.util.UUID;
import java.util.stream.Collectors;

public class FileProcessorUtils {

   
    /***
     * 此方法针对本地文件
     * 提取文件信息并返回内容
     * @param filePath 文件储存地址
     * @param imgRoot 图片存储地址
     * @return
     */
    public static String processDocumentFromFilePath(String filePath,String imgRoot) throws IOException {
        File file = new File(filePath);
        FileInputStream fileInputStream = new FileInputStream(file);

        // 根据文件类型调用适当的处理方法
        switch (fileTypeName(filePath)) {
            case "doc":
                return processWordDocDocumentFromStream(fileInputStream,imgRoot);
            case "docx":
                return processWordDocxDocumentFromStream(fileInputStream,imgRoot);
            case "pdf":
                return processPdfDocumentFromStream(fileInputStream,imgRoot);
            case "ppt":
                return processPptDocumentFromStream(fileInputStream,imgRoot);
            case "pptx":
                return processPptxDocumentFromStream(fileInputStream,imgRoot);
            default:
                throw new RuntimeException("不支持的文件格式,文件解析目前只支持(DOC/DOCX/PDF/PPT/PPTX)");
        }
    }


    /***
     * 此方法针对网络文件
     * 提取文件信息并返回内容
     * @param downloadUrl 文件下载链接
     * @param imgRoot 图片存储地址
     * @return
     */
    public static String processDocumentFromUrl(String downloadUrl,String imgRoot) throws IOException {

        HttpClient httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(downloadUrl);
        HttpResponse response = httpClient.execute(httpGet);

        //获取文件类型
        // TODO: 2023/9/14  此处并不是所有的下载链接都存在后缀信息,如果为了提升代码的健壮性,可以在此处修改代码以获取文件类型
        String typeName = fileTypeName(downloadUrl);
        // 根据文件类型调用适当的处理方法
        switch (typeName) {
            case "doc":
                return processWordDocDocumentFromStream(response.getEntity().getContent(),imgRoot);
            case "docx":
                return processWordDocxDocumentFromStream(response.getEntity().getContent(),imgRoot);
            case "pdf":
                return processPdfDocumentFromStream(response.getEntity().getContent(),imgRoot);
            case "ppt":
                return processPptDocumentFromStream(response.getEntity().getContent(),imgRoot);
            case "pptx":
                return processPptxDocumentFromStream(response.getEntity().getContent(),imgRoot);
            default:
                throw new RuntimeException("不支持的文件格式,文件解析目前只支持(DOC/DOCX/PDF/PPT/PPTX)");
        }
    }

    /***
     * word(doc)文件处理
     * @param inputStream(文件流)
     * @return
     */
    private static String processWordDocDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException {
        HWPFDocument document = new HWPFDocument(inputStream);
        StringBuilder htmlText = new StringBuilder();
        WordExtractor extractor = new WordExtractor(document);

        try {
            String[] paragraphs = extractor.getParagraphText();
            for (int paragraphIndex = 0; paragraphIndex < paragraphs.length; paragraphIndex++) {
                String paragraphText = paragraphs[paragraphIndex];
                //获取文本对齐方式
                String justification = getJustification(document.getRange().getParagraph(paragraphIndex).getJustification());
                // 根据需要添加其他HTML标签
                htmlText.append("

").append(paragraphText).append("").append("

"); } // 提取图片 List pictures = document.getPicturesTable().getAllPictures(); for (int i = 0; i < pictures.size(); i++) { Picture picture = pictures.get(i); byte[] pictureData = picture.getContent(); String newFileName = new Date().getTime() + i + "_image." + picture.suggestFileExtension(); // 可以根据需要更改扩展名,suggestFileExtension()方法自动获取合适的图片类型 String imgPath = saveImageToFile(pictureData, newFileName, imageRoot); htmlText.append("

JAVA读取(DOC、DOCX、PDF、PPT、PPTX)文件文本内容及图片

"); } } finally { extractor.close(); document.close(); } return htmlText.toString(); } /*** * word(docx)文件处理 * @param inputStream(文件流) * @return */ private static String processWordDocxDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException { //获取文件内容 XWPFDocument document = new XWPFDocument(inputStream); StringBuilder htmlText = new StringBuilder(); try { //获取所有元素 List paragraphs = document.getParagraphs(); //根据元素类型追加 for (XWPFParagraph paragraph : paragraphs) { //获取文本对齐方式 ParagraphAlignment alignment = paragraph.getAlignment(); htmlText.append("

"); List runs = paragraph.getRuns(); for (XWPFRun run : runs) { // 处理字体大小、样式等信息 String fontSize = run.getFontSize() + "pt"; String fontFamily = run.getFontFamily(); // 添加样式信息到HTML htmlText.append("" + run.text() + ""); } htmlText.append("

"); // 检查当前行段落是否有图片存在 List pictures = paragraph.getRuns().stream() .flatMap(run -> run.getEmbeddedPictures().stream()) .collect(Collectors.toList()); if(CollectionUtils.isNotEmpty(pictures)){ if(pictures.size()>0){ pictures.forEach( bean ->{ XWPFPictureData pictureData = bean.getPictureData(); String newFileName = new Date().getTime() + "_image." + pictureData.suggestFileExtension(); String imgPath = null; try { imgPath = saveImageToFile(pictureData.getData(), newFileName, imageRoot); } catch (IOException e) { throw new RuntimeException(e); } htmlText.append("

"); }); } } } } finally { document.close(); } return htmlText.toString(); } /*** * Pdf文件处理 * @param inputStream(文件流) * @return */ private static String processPdfDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException { PDDocument pdfDocument = PDDocument.load(inputStream); PDFTextStripper textStripper = new PDFTextStripper(); StringBuilder htmlText = new StringBuilder(); String[] lines = textStripper.getText(pdfDocument).split("\n"); for (String line : lines) { htmlText.append("

").append(line).append("

"); } pdfDocument.close(); return htmlText.toString(); } /** * 处理PPT(.ppt)文件 * @param inputStream(文件流) * @return * @throws IOException */ private static String processPptDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException { HSLFSlideShow ppt = new HSLFSlideShow(inputStream); StringBuilder pptText = new StringBuilder(); try { // 提取文本内容 for (HSLFSlide slide : ppt.getSlides()) { for (HSLFShape shape : slide.getShapes()) { //如果是文本处理文本 if (shape instanceof HSLFTextShape) { HSLFTextShape textShape = (HSLFTextShape) shape; for (HSLFTextParagraph paragraph : textShape.getTextParagraphs()) { //获取文本对齐方式 TextParagraph.TextAlign textAlign = paragraph.getTextAlign(); pptText.append("

"); for (HSLFTextRun run : paragraph.getTextRuns()) { // 处理字体大小、字体样式等信息 String fontSize = run.getFontSize() + "pt"; String fontFamily = run.getFontFamily(); run.getRawText(); // 添加样式信息到HTML pptText.append("" + run.getRawText() + ""); } pptText.append("

"); // 换行处理 } }else if (shape instanceof HSLFPictureShape) { // 如果是图片,处理图片 HSLFPictureShape pictureShape = (HSLFPictureShape) shape; HSLFPictureData pictureData = pictureShape.getPictureData(); String contentType = pictureData.getContentType(); String newFileName = new Date().getTime() + "_image." + imageTypeName(contentType); String imgPath = saveImageToFile(pictureData.getData(), newFileName, imageRoot); pptText.append("

"); } } } } finally { ppt.close(); } return pptText.toString(); } /** * 处理PPTX(.pptx)文件 * @param inputStream(文件流) * @return * @throws IOException */ private static String processPptxDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException { XMLSlideShow pptx = new XMLSlideShow(inputStream); StringBuilder pptxText = new StringBuilder(); try { // 提取文本内容 for (XSLFSlide slide : pptx.getSlides()) { for (XSLFShape shape : slide.getShapes()) { if (shape instanceof XSLFTextShape) { XSLFTextShape textShape = (XSLFTextShape) shape; for (XSLFTextParagraph paragraph : textShape.getTextParagraphs()) { //获取文本对齐方式 TextParagraph.TextAlign textAlign = paragraph.getTextAlign(); pptxText.append("

"); for (XSLFTextRun run : paragraph.getTextRuns()) { // 处理字体大小、字体样式等信息 String fontSize = run.getFontSize() + "pt"; String fontFamily = run.getFontFamily(); // 添加样式信息到HTML pptxText.append("" + run.getRawText() + ""); } pptxText.append("

"); // 换行处理 } }else if (shape instanceof XSLFPictureShape) { // 如果是图片,处理图片 XSLFPictureShape pictureShape = (XSLFPictureShape) shape; XSLFPictureData pictureData = pictureShape.getPictureData(); String newFileName = new Date().getTime() + "_image." + pictureData.suggestFileExtension(); String imgPath = saveImageToFile(pictureData.getData(), newFileName, imageRoot); pptxText.append("

"); } } } } finally { pptx.close(); } return pptxText.toString(); } /** * 保存图片到指定位置,并返回引用地址 * @param imageData * @param imageRoot * @return * @throws IOException */ public static String saveImageToFile(byte[] imageData, String imageFileName, String imageRoot) throws IOException { String imagePath = imageRoot + File.separator + imageFileName; File file = new File(imageRoot); if(!file.exists()){ file.mkdir(); } try (FileOutputStream fos = new FileOutputStream(imagePath)) { fos.write(imageData); } return imagePath; } /** * 表格处理 * @param table * @return */ private static String getTableHtmlText(XWPFTable table) { StringBuilder tableHtml = new StringBuilder(""); for (XWPFTableRow row : table.getRows()) { tableHtml.append(""); for (XWPFTableCell cell : row.getTableCells()) { tableHtml.append(""); } tableHtml.append(""); } tableHtml.append("
").append(cell.getText()).append("
"); return tableHtml.toString(); } /*** * 获取文件后缀 * @param filePath * @return */ private static String fileTypeName(String filePath) { int dotIndex = filePath.lastIndexOf("."); if (dotIndex > 0) { return filePath.substring(dotIndex + 1).toLowerCase(); } return ""; } /*** * 获取图片类型 * @param imagePath * @return */ private static String imageTypeName(String imagePath) { int dotIndex = imagePath.lastIndexOf("/"); if (dotIndex > 0) { return imagePath.substring(dotIndex + 1).toLowerCase(); } return ""; } /*** * doc文档获取当前行对齐方式 默认左对齐 * @param type * @return */ private static String getJustification(Integer type) { switch (type) { case 0: return "left"; case 1: return "center"; case 2: return "right"; default: return "left"; } } }

本文来自网络,不代表协通编程立场,如若转载,请注明出处:https://www.net2asp.com/5acf7172af.html