Java实现Word/Pdf/TXT转html

2019-12-23 08:56:13 | 来源：博客园

Java实现Word/Pdf/TXT转html

引言:

最近公司在做一个教育培训学习及在线考试的项目，本人主要负责网络课程模块，包括课程分类、课程、课件的创建，以及在线学习和统计功能。课件涉及多种类型，如视频、音频、图文、外部链接及文档。其中就涉及到一个问题：文档型课件如何在网页上展示和学习。因为要在线统计学习的课程、学习的人员和学习的时长，所以不能像传统做法那样把文档下载到本地学习——那样就不受系统控制了。最终的方案是：在上传文档型课件时，将文件转换成对应的 HTML 文件，以便在网页上浏览学习。

下面主要针对 Word、PDF 和 TXT 文本文件的转换进行说明。

一:Java实现将word转换为html

1:引入依赖

<dependency>
  <groupId>fr.opensagres.xdocreport</groupId>
  <artifactId>fr.opensagres.xdocreport.document</artifactId>
  <version>1.0.5</version>
</dependency>
<dependency>
  <groupId>fr.opensagres.xdocreport</groupId>
  <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
  <version>1.0.5</version>
</dependency>
<dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>poi</artifactId>
  <version>3.12</version>
</dependency>
<dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>poi-scratchpad</artifactId>
  <version>3.12</version>
</dependency>

2:代码demo

package com.svse.controller;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.core.IURIResolver;
import org.apache.poi.xwpf.converter.core.IXWPFConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;
 24 /**
 25  * word 转换成html
 26  */
 27 public class TestWordToHtml {
 28 
 29     public static  final String STORAGEPATH="C://works//files//";
 30     public static  final String IP="192.168.30.222";
 31     public static  final String PORT="8010";
 32     public static void main(String[] args) throws IOException, TransformerException, ParserConfigurationException {
 33         TestWordToHtml wt=new TestWordToHtml();
 34         //wt.Word2003ToHtml("甲骨文考证.doc");
 35         wt.Word2007ToHtml("甲骨文考证.docx");
 36 
 37     }
 38       
 39      /**
 40      * 2003版本word转换成html
 41      * @throws IOException
 42      * @throws TransformerException
 43      * @throws ParserConfigurationException
 44      */
 45     public void Word2003ToHtml(String fileName) throws IOException, TransformerException, ParserConfigurationException {
 46        
 47         final String imagepath = STORAGEPATH+"fileImage/";//解析时候如果doc文件中有图片  图片会保存在此路径
 48         final String strRanString=getRandomNum();
 49         String filepath =STORAGEPATH;
 50         String htmlName =fileName.substring(0, fileName.indexOf("."))+ "2003.html";
 51         final String file = filepath + fileName;
 52         InputStream input = new FileInputStream(new File(file));
 53         HWPFDocument wordDocument = new HWPFDocument(input);
 54         WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
 55         //设置图片存放的位置
 56         wordToHtmlConverter.setPicturesManager(new PicturesManager() {
 57             public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
 58                 File imgPath = new File(imagepath);
 59                 if(!imgPath.exists()){//图片目录不存在则创建
 60                     imgPath.mkdirs();
 61                 }
 62                 
 63                 File file = new File(imagepath +strRanString+suggestedName);
 64                 try {
 65                     OutputStream os = new FileOutputStream(file);
 66                     os.write(content);
 67                     os.close();
 68                 } catch (FileNotFoundException e) {
 69                     e.printStackTrace();
 70                 } catch (IOException e) {
 71                     e.printStackTrace();
 72                 }
 73                 
 74                 return  "http://"+IP+":"+PORT+"//uploadFile/fileImage/"+strRanString+suggestedName;
 75                // return imagepath +strRanString+suggestedName;
 76             }
 77         });
 78         
 79         //解析word文档
 80         wordToHtmlConverter.processDocument(wordDocument);
 81         Document htmlDocument = wordToHtmlConverter.getDocument();
 82         
 83         File htmlFile = new File(filepath +strRanString+htmlName);
 84         OutputStream outStream = new FileOutputStream(htmlFile);
 85         
 86 
 87         DOMSource domSource = new DOMSource(htmlDocument);
 88         StreamResult streamResult = new StreamResult(outStream);
 89 
 90         TransformerFactory factory = TransformerFactory.newInstance();
 91         Transformer serializer = factory.newTransformer();
 92         serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
 93         serializer.setOutputProperty(OutputKeys.INDENT, "yes");
 94         serializer.setOutputProperty(OutputKeys.METHOD, "html");
 95         
 96         serializer.transform(domSource, streamResult);
 97         outStream.close();
 98         
 99         System.out.println("生成html文件路径:"+ "http://"+IP+":"+PORT+"//uploadFile/"+strRanString+htmlName);
100     }
101 
102     /**
103      * 2007版本word转换成html
104      * @throws IOException
105      */
106     public void Word2007ToHtml(String fileName) throws IOException {
107         
108        final String strRanString=getRandomNum();
109         
110         String filepath = STORAGEPATH+strRanString;
111         String htmlName =fileName.substring(0, fileName.indexOf("."))+ "2007.html";
112         File f = new File(STORAGEPATH+fileName);  
113         if (!f.exists()) {  
114             System.out.println("Sorry File does not Exists!");  
115         } else {  
116             if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {  
117                 try {
118                     // 1) 加载word文档生成 XWPFDocument对象  
119                     InputStream in = new FileInputStream(f);  
120                     XWPFDocument document = new XWPFDocument(in);  
121       
122                     // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)  
123                     File imageFolderFile = new File(filepath);  
124                     XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));  
125                     options.setExtractor(new FileImageExtractor(imageFolderFile));  
126                     options.URIResolver(new IURIResolver() {
127                         public String resolve(String uri) {
128                             //http://192.168.30.222:8010//uploadFile/....
129                             return "http://"+IP+":"+PORT+"//uploadFile/"+strRanString +"/"+ uri;
130                         }
131                     });
132                     
133                     options.setIgnoreStylesIfUnused(false);  
134                     options.setFragment(true);  
135                       
136                     // 3) 将 XWPFDocument转换成XHTML  
137                     OutputStream out = new FileOutputStream(new File(filepath + htmlName));  
138                     IXWPFConverter<XHTMLOptions> converter = XHTMLConverter.getInstance();
139                     converter.convert(document,out, options);
140                     //XHTMLConverter.getInstance().convert(document, out, options);  
141                     System.out.println("html路径:"+"http://"+IP+":"+PORT+"//uploadFile/"+strRanString+htmlName);
142                 } catch (Exception e) {
143                     e.printStackTrace();
144                 }
145             
146             } else {  
147                 System.out.println("Enter only MS Office 2007+ files");  
148             }  
149         }  
150     }  
151 
152      /**
153      *功能说明:生成时间戳
154      *创建人:zsq
155      *创建时间:2019年12月7日 下午2:37:09
156      *
157      */
158      public static String getRandomNum(){
159          Date dt = new Date();
160          SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");  
161          String str=sdf.format(dt);
162          return str;
163      }
164      
165    }

二:Java实现将Pdf转换为html

1: 引入依赖

<dependency>
  <groupId>net.sf.cssbox</groupId>
  <artifactId>pdf2dom</artifactId>
  <version>1.7</version>
</dependency>
<dependency>
  <groupId>org.apache.pdfbox</groupId>
  <artifactId>pdfbox</artifactId>
  <version>2.0.12</version>
</dependency>
<dependency>
  <groupId>org.apache.pdfbox</groupId>
  <artifactId>pdfbox-tools</artifactId>
  <version>2.0.12</version>
</dependency>

2:代码Demo

 1 public class PdfToHtml {
 2 
 3   /*
 4     pdf转换html
 5      */
 6     public void pdfToHtmlTest(String inPdfPath,String outputHtmlPath)  {
 7        // String outputPath = "C:\\works\\files\\ZSQ保密知识测试题库.html";
 8     9        //try() 写在()里面会自动关闭流
10         try{
11             BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(outputHtmlPath)),"utf-8"));
12             //加载PDF文档
13             //PDDocument document = PDDocument.load(bytes);
14             PDDocument document = PDDocument.load(new File(inPdfPath));
15             PDFDomTree pdfDomTree = new PDFDomTree();
16             pdfDomTree.writeText(document,out);
17         } catch (Exception e) {
18             e.printStackTrace();
19         }
20     }
21 
22     public static void main(String[] args) throws IOException {
23         PdfToHtml ph=new PdfToHtml();
24         String pdfPath="C:\\works\\files\\武研中心行政考勤制度.pdf";
25         String outputPath="C:\\works\\files\\武研中心行政考勤制度.html";
26         ph.pdfToHtmlTest(pdfPath,outputPath);
27   }
28 
29 }

三:Java实现将TXT转换为html

 1  /*
 2      * txt文档转html
 3        filePath:txt原文件路径
 4        htmlPosition:转化后生成的html路径
 5     */
 6     public static void txtToHtml(String filePath, String htmlPosition) {
 7         try {
 8             //String encoding = "GBK";
 9             File file = new File(filePath);
10             if (file.isFile() && file.exists()) { // 判断文件是否存在
11                 InputStreamReader read = new InputStreamReader(new FileInputStream(file), "GBK");
12                 // 考虑到编码格式
13                 BufferedReader bufferedReader = new BufferedReader(read);
14                 // 写文件
15                 FileOutputStream fos = new FileOutputStream(new File(htmlPosition));
16                 OutputStreamWriter osw = new OutputStreamWriter(fos, "GBK");
17                 BufferedWriter bw = new BufferedWriter(osw);
18                 String lineTxt = null;
19                 while ((lineTxt = bufferedReader.readLine()) != null) {
20                     bw.write("&nbsp&nbsp&nbsp"+lineTxt + "</br>");
21                 }
22                 bw.close();
23                 osw.close();
24                 fos.close();
25                 read.close();
26             } else {
27                 System.out.println("找不到指定的文件");
28             }
29         } catch (Exception e) {
30             System.out.println("读取文件内容出错");
31             e.printStackTrace();
32         }
33     }