Apache PDFBox是一个开源的、基于Java的、支持PDF文档生成的工具库,它可以用于创建新的PDF文档,修改现有的PDF文档,还可以从PDF文档中提取所需的内容。Apache PDFBox还包含了数个命令行工具。
Apache PDFBox主要有以下特征:
支持PDF的读取、创建、打印、转换、验证、合并与分割等功能。
所需jar包
<dependency><groupId>org.apache.pdfbox</groupId><artifactId>pdfbox</artifactId><version>2.0.12</version></dependency><dependency><groupId>org.apache.pdfbox</groupId><artifactId>fontbox</artifactId><version>2.0.12</version></dependency>
文本内容提取
//public static void PdfReader(String filePath){File pdfFile = new File(filePath);PDDocument document = null;try {// PDF文件加载方式一/*InputStream input = null;input = new FileInputStream( pdfFile );//加载 pdf 文档PDFParser parser = new PDFParser(new RandomAccessBuffer(input));parser.parse();document = parser.getPDDocument();*/// PDF文件加载方式二document=PDDocument.load(pdfFile);// 获取页码int pages = document.getNumberOfPages();// 读文本内容PDFTextStripper stripper=new PDFTextStripper();// 设置按顺序输出stripper.setSortByPosition(true);stripper.setStartPage(1);stripper.setEndPage(pages);String content = stripper.getText(document);System.out.println(content);} catch(Exception e) {e.printStackTrace();}}
图片提取
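此方法可以取出源PDF中的图片对象PDImageXObject,然后可以对该对象进行相关处理。本代码实现了将提取出来的每一个图片对象插入到一个空白的PDF文档中。注意:运行前需要在源PDF所在目录下预先准备一个至少包含一页的空白PDF文件 testout.pdf;每提取一张图片,都会把它追加到该空白文档的第一页,并另存为 test0.pdf、test1.pdf……因此后生成的文件中会同时包含之前已插入的图片。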
public static void readImage(String filePath) throws IOException {// 待解析PDFFile pdfFile = new File(filePath);// 文件所在目录String fileDirectory = pdfFile.getParent();// 空白PDF - 内部没有任何内容的PDFFile pdfFileOut = new File(fileDirectory+"\\testout.pdf");PDDocument document = null;PDDocument documentOut = null;try {document = PDDocument.load(pdfFile);documentOut = PDDocument.load(pdfFileOut);} catch (IOException e) {e.printStackTrace();}int pages_size = document == null ? 0:document.getNumberOfPages();int j=0;for(int i=0;i<pages_size;i++) {PDPage page = document.getPage(i);PDPage pageOut = documentOut ==null?null:documentOut.getPage(0);PDResources resources = page.getResources();Iterable xobjects = resources.getXObjectNames();if (xobjects != null) {for (Object xobject : xobjects) {COSName key = (COSName) xobject;if (resources.isImageXObject(key)) {try {PDImageXObject image = (PDImageXObject) resources.getXObject(key);// 将PDF文档中的图片 分别存到一个空白PDF中。PDPageContentStream contentStream = new PDPageContentStream(documentOut, pageOut, PDPageContentStream.AppendMode.APPEND, true);float scale = 1f;contentStream.drawImage(image, 20, 20, image.getWidth() * scale, image.getHeight() * scale);contentStream.close();documentOut.save(fileDirectory + "/test" + j + ".pdf");System.out.println(image.getSuffix() + "," + image.getHeight() + "," + image.getWidth());} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}//image countj++;}}}}}
PDF文件打印
/*** 根据文档地址打印PDF文件* @param filePath 文件地址* @param printerName* @throws Exception*/public static void PdfPrint(File file , String printerName) throws Exception {PDDocument document = null;try {document = PDDocument.load(file);PrinterJob printJob = PrinterJob.getPrinterJob();printJob.setJobName(file.getName());// 获取默认打印机 PrintService printService = PrintServiceLookup.lookupDefaultPrintService();if (printerName != null) {// 查找并设置打印机//获得本台电脑连接的所有打印机PrintService[] printServices = PrinterJob.lookupPrintServices();if(printServices == null || printServices.length == 0) {System.out.print("打印失败,未找到可用打印机,请检查。");return ;}PrintService printService = null;//匹配指定打印机for (int i = 0;i < printServices.length; i++) {//System.out.println(printServices[i].getName());if (printServices[i].getName().contains(printerName)) {printService = printServices[i];break;}}if(printService!=null){printJob.setPrintService(printService);}else{System.out.print("打印失败,未找到名称为" + printerName + "的打印机,请检查。");return ;}}//设置纸张及缩放PDFPrintable pdfPrintable = new PDFPrintable(document, Scaling.ACTUAL_SIZE);//设置多页打印Book book = new Book();PageFormat pageFormat = new PageFormat();//设置打印方向pageFormat.setOrientation(PageFormat.PORTRAIT);//纵向pageFormat.setPaper(getPaper());//设置纸张book.append(pdfPrintable, pageFormat, document.getNumberOfPages());printJob.setPageable(book);printJob.setCopies(1);//设置打印份数//添加打印属性HashPrintRequestAttributeSet pars = new HashPrintRequestAttributeSet();pars.add(Sides.DUPLEX); //设置单双页printJob.print(pars);}finally {if (document != null) {try {document.close();} catch (IOException e) {e.printStackTrace();}}}}/*** 根据URL地址打印PDF文件* @param urlStr url地址* @param printerName 打印机名字* @param orientation 1=竖向 2=横向* @throws Exception*/public static void PdfPrintByUrl(String urlStr ,String printerName ,Integer orientation) throws Exception {PDDocument document = null;try {URL url = new URL(urlStr);HttpURLConnection httpURLConnection = (HttpURLConnection) 
url.openConnection();httpURLConnection.setRequestProperty("Charset", "UTF-8");httpURLConnection.setConnectTimeout(5*1000);httpURLConnection.connect();// 构造待打印的文件流InputStream fis=httpURLConnection.getInputStream();document = PDDocument.load(fis);PrinterJob printJob = PrinterJob.getPrinterJob();// 查找并设置打印机//获得本台电脑连接的所有打印机PrintService[] printServices = PrinterJob.lookupPrintServices();if(printServices == null || printServices.length == 0) {System.out.print("打印失败,未找到可用打印机,请检查。");return ;}PrintService printService = null;//匹配指定打印机for (int i = 0;i < printServices.length; i++) {//System.out.println(printServices[i].getName());if (printServices[i].getName().contains(printerName)) {printService = printServices[i];break;}}if(printService!=null){printJob.setPrintService(printService);}else{System.out.print("打印失败,未找到名称为" + printerName + "的打印机,请检查。");return ;}//设置纸张及缩放PDFPrintable pdfPrintable = new PDFPrintable(document, Scaling.ACTUAL_SIZE);//设置多页打印Book book = new Book();PageFormat pageFormat = new PageFormat();//设置打印方向 PORTRAIT 竖向 REVERSE_LANDSCAPE 横向pageFormat.setOrientation(orientation == 1 ? PageFormat.PORTRAIT : PageFormat.REVERSE_LANDSCAPE);//设置纸张pageFormat.setPaper(getPaper());book.append(pdfPrintable, pageFormat, document.getNumberOfPages());printJob.setPageable(book);//设置打印份数printJob.setCopies(1);//添加打印属性HashPrintRequestAttributeSet pars = new HashPrintRequestAttributeSet();//设置单双页pars.add(Sides.DUPLEX);printJob.print(pars);}finally {if (document != null) {try {document.close();} catch (IOException e) {e.printStackTrace();}}}}public static Paper getPaper() {Paper paper = new Paper();// 默认为A4纸张,对应像素宽和高分别为 595, 842int width = 595;int height = 842;// 设置边距,单位是像素,10mm边距,对应 28pxint marginLeft = 10;int marginRight = 0;int marginTop = 10;int marginBottom = 0;paper.setSize(width, height);// 下面一行代码,解决了打印内容为空的问题paper.setImageableArea(marginLeft, marginRight, width - (marginLeft + marginRight), height - (marginTop + marginBottom));return paper;}
PdfBoxUtils文件完整代码
package com.example.demo.common;

import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.printing.PDFPrintable;
import org.apache.pdfbox.printing.Scaling;
import org.apache.pdfbox.text.PDFTextStripper;

import javax.print.PrintService;
import javax.print.PrintServiceLookup;
import javax.print.attribute.HashPrintRequestAttributeSet;
import javax.print.attribute.standard.Sides;
import java.awt.print.Book;
import java.awt.print.PageFormat;
import java.awt.print.Paper;
import java.awt.print.PrinterJob;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Static helpers built on Apache PDFBox 2.x: text extraction, image extraction and printing.
 */
public class PDFBoxUtils {

    /**
     * Demo entry point. Only image extraction is enabled; the other calls are kept as
     * commented-out usage examples.
     */
    public static void main(String[] args) throws Exception {
        // Path of the PDF to process.
        String filePath = "C:\\Users\\92384\\Documents\\test.pdf";
        // PdfReader(filePath);
        readImage(filePath);
        // Substring of the printer name to match.
        String printerName = "Microsoft Print to PDF";
        // PdfPrint(filePath, printerName);
        String urlStr = "https://dlj.51fapiao.cn/dlj/v7/27a91fc541ac427967ef6d8f5019a98928f255";
        // PdfPrintByUrl(urlStr, printerName, 2);
    }

    /**
     * Extracts the text of every page of a PDF file and prints it to standard output.
     *
     * <p>Any failure while loading or reading the document is reported as a stack trace
     * and is not rethrown.
     *
     * @param filePath path of the PDF file to read
     */
    public static void PdfReader(String filePath) {
        File pdfFile = new File(filePath);

        // Loading option 1 (not used here): wrap a FileInputStream in a RandomAccessBuffer,
        // run a PDFParser over it and take the document from parser.getPDDocument().
        // Loading option 2 (used): PDDocument.load(File).
        // try-with-resources closes the document even when text extraction fails;
        // the previous version never closed it.
        try (PDDocument document = PDDocument.load(pdfFile)) {
            int pages = document.getNumberOfPages();

            PDFTextStripper stripper = new PDFTextStripper();
            // Emit the text sorted by its position on the page.
            stripper.setSortByPosition(true);
            // Page numbers are 1-based; cover the whole document.
            stripper.setStartPage(1);
            stripper.setEndPage(pages);

            String content = stripper.getText(document);
            System.out.println(content);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts every image XObject referenced by the page resources of a PDF and draws it
     * onto the first page of a template PDF, saving a snapshot after each image.
     *
     * <p>The template is {@code testout.pdf} in the same directory as the source file. It
     * must already exist and contain at least one page. Because the same template document
     * is reused, images accumulate: {@code test0.pdf} holds the first image,
     * {@code test1.pdf} the first two, and so on. The suffix, height and width of each image
     * are printed to standard output.
     *
     * <p>A failure on a single image is reported as a stack trace and the remaining images
     * are still processed.
     *
     * @param filePath path of the PDF file to extract images from
     * @throws IOException if the source PDF or the template PDF cannot be loaded
     */
    public static void readImage(String filePath) throws IOException {
        File pdfFile = new File(filePath);
        // Directory of the source file; the template and the output files live next to it.
        String fileDirectory = pdfFile.getParent();
        // Template PDF that receives the images. Built with File(parent, child) so the
        // separator is correct on every platform.
        File pdfFileOut = new File(fileDirectory, "testout.pdf");

        // Both documents stay open until the last save: the images drawn into the template
        // still reference data owned by the source document. A load failure now propagates
        // instead of leading to a NullPointerException later.
        try (PDDocument document = PDDocument.load(pdfFile);
             PDDocument documentOut = PDDocument.load(pdfFileOut)) {

            PDPage pageOut = documentOut.getPage(0);
            int imageCount = 0;

            for (PDPage page : document.getPages()) {
                PDResources resources = page.getResources();
                if (resources == null) {
                    // A page without a resource dictionary has no images.
                    continue;
                }
                for (COSName key : resources.getXObjectNames()) {
                    if (!resources.isImageXObject(key)) {
                        continue;
                    }
                    try {
                        PDImageXObject image = (PDImageXObject) resources.getXObject(key);

                        // Append to the existing page content; the stream is closed before saving.
                        try (PDPageContentStream contentStream = new PDPageContentStream(
                                documentOut, pageOut, PDPageContentStream.AppendMode.APPEND, true)) {
                            float scale = 1f;
                            contentStream.drawImage(image, 20, 20,
                                    image.getWidth() * scale, image.getHeight() * scale);
                        }

                        documentOut.save(new File(fileDirectory, "test" + imageCount + ".pdf"));
                        System.out.println(image.getSuffix() + "," + image.getHeight() + "," + image.getWidth());
                    } catch (IOException e) {
                        // Best effort: report the broken image and keep going with the others.
                        e.printStackTrace();
                    }
                    // Counts every image encountered, including ones that failed, so that
                    // output file numbers match the image's position in the source.
                    imageCount++;
                }
            }
        }
    }

    /**
     * Prints a PDF file in portrait orientation.
     *
     * @param filePath    path of the PDF file to print
     * @param printerName substring of the target printer's name, or {@code null} to use the
     *                    printer job's default printer
     * @throws Exception if the file cannot be loaded or the print job fails
     */
    public static void PdfPrint(String filePath, String printerName) throws Exception {
        File file = new File(filePath);
        // try-with-resources closes the document whether or not printing succeeds.
        try (PDDocument document = PDDocument.load(file)) {
            PrinterJob printJob = PrinterJob.getPrinterJob();
            printJob.setJobName(file.getName());

            if (printerName != null) {
                PrintService printService = findPrintService(printerName);
                if (printService == null) {
                    // findPrintService has already reported why no printer was selected.
                    return;
                }
                printJob.setPrintService(printService);
            }

            printDocument(printJob, document, PageFormat.PORTRAIT);
        }
    }

    /**
     * Downloads a PDF from a URL and prints it.
     *
     * @param urlStr      URL of the PDF document
     * @param printerName substring of the target printer's name, or {@code null} to use the
     *                    printer job's default printer
     * @param orientation 1 = portrait, any other value = (reverse) landscape;
     *                    {@code null} is treated as portrait
     * @throws Exception if the download, the PDF parsing or the print job fails
     */
    public static void PdfPrintByUrl(String urlStr, String printerName, Integer orientation) throws Exception {
        HttpURLConnection httpURLConnection = (HttpURLConnection) new URL(urlStr).openConnection();
        httpURLConnection.setRequestProperty("Charset", "UTF-8");
        httpURLConnection.setConnectTimeout(5 * 1000);
        try {
            httpURLConnection.connect();

            // The response stream and the document are both closed when the block exits.
            try (InputStream pdfStream = httpURLConnection.getInputStream();
                 PDDocument document = PDDocument.load(pdfStream)) {

                PrinterJob printJob = PrinterJob.getPrinterJob();

                // A null printer name used to fail with a NullPointerException inside
                // String.contains; it now falls back to the default printer like PdfPrint.
                if (printerName != null) {
                    PrintService printService = findPrintService(printerName);
                    if (printService == null) {
                        return;
                    }
                    printJob.setPrintService(printService);
                }

                // Guard against auto-unboxing a null Integer.
                boolean portrait = orientation == null || orientation == 1;
                printDocument(printJob, document,
                        portrait ? PageFormat.PORTRAIT : PageFormat.REVERSE_LANDSCAPE);
            }
        } finally {
            httpURLConnection.disconnect();
        }
    }

    /**
     * Finds the first installed printer whose name contains {@code printerName}.
     * Prints a failure message to standard output and returns {@code null} when no printer
     * is installed or none matches.
     */
    private static PrintService findPrintService(String printerName) {
        PrintService[] printServices = PrinterJob.lookupPrintServices();
        if (printServices == null || printServices.length == 0) {
            System.out.print("打印失败,未找到可用打印机,请检查。");
            return null;
        }
        for (PrintService candidate : printServices) {
            if (candidate.getName().contains(printerName)) {
                return candidate;
            }
        }
        System.out.print("打印失败,未找到名称为" + printerName + "的打印机,请检查。");
        return null;
    }

    /**
     * Sends every page of {@code document} to {@code printJob} at actual size, as one
     * duplex copy, using the paper from {@link #getPaper()} and the given orientation.
     */
    private static void printDocument(PrinterJob printJob, PDDocument document, int orientation) throws Exception {
        // Render pages at their actual size, without scaling to the paper.
        PDFPrintable pdfPrintable = new PDFPrintable(document, Scaling.ACTUAL_SIZE);

        PageFormat pageFormat = new PageFormat();
        pageFormat.setOrientation(orientation);
        pageFormat.setPaper(getPaper());

        // A Book lets the job print all pages with the same page format.
        Book book = new Book();
        book.append(pdfPrintable, pageFormat, document.getNumberOfPages());
        printJob.setPageable(book);
        printJob.setCopies(1);

        // Request double-sided printing.
        HashPrintRequestAttributeSet pars = new HashPrintRequestAttributeSet();
        pars.add(Sides.DUPLEX);
        printJob.print(pars);
    }

    /**
     * Builds the paper definition used for printing: A4 with a 10-unit left and top margin
     * and no right or bottom margin.
     *
     * @return a new {@link Paper} of size 595 x 842 with its imageable area set
     */
    public static Paper getPaper() {
        Paper paper = new Paper();
        // Paper dimensions are expressed in units of 1/72 inch; 595 x 842 is A4.
        int width = 595;
        int height = 842;
        // Margins, in the same 1/72-inch units.
        int marginLeft = 10;
        int marginRight = 0;
        int marginTop = 10;
        int marginBottom = 0;
        paper.setSize(width, height);
        // setImageableArea takes (x, y, width, height): the origin is the left/top margin.
        // The previous code passed marginRight as y, which silently dropped the top margin.
        // The original author notes that setting the imageable area explicitly fixed blank printouts.
        paper.setImageableArea(marginLeft, marginTop,
                width - (marginLeft + marginRight), height - (marginTop + marginBottom));
        return paper;
    }
}
