```
using org.pdfbox.pdmodel;
using org.pdfbox.util;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using iTextSharp;
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
namespace WindowsFormsApplication1
{
/// <summary>
/// Demo form that extracts the text of a PDF file in two ways (PDFBox and
/// iTextSharp) and writes the result to a text file.
/// </summary>
public partial class Form1 : Form
{
    /// <summary>Length of the "%PDF-1.x" signature at the start of a PDF file.</summary>
    private const int PdfHeaderLength = 8;

    /// <summary>Index of the minor-version digit inside the "%PDF-1.x" signature.</summary>
    private const int PdfMinorVersionOffset = 7;

    /// <summary>
    /// Output file shared by both extraction paths. The iTextSharp pass runs
    /// second and therefore overwrites whatever the PDFBox pass wrote.
    /// </summary>
    private const string OutputTextPath = "d:\\test.txt";

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Click handler: runs the PDFBox extraction and then the iTextSharp
    /// extraction on the same hard-coded input file.
    /// </summary>
    private void button1_Click(object sender, EventArgs e)
    {
        string path = "d:\\A.pdf";
        FileInfo file = new FileInfo(path);
        FileInfo txtfile = new FileInfo(OutputTextPath);

        // pdf2txt reports problems through its return value; record it
        // instead of silently discarding it.
        string message = pdf2txt(file, txtfile);
        if (!string.IsNullOrEmpty(message))
        {
            System.Diagnostics.Trace.TraceWarning(message);
        }

        OnCreated(path);
    }

    /// <summary>
    /// Extracts the text of <paramref name="file"/> with PDFBox (0.7.3) and
    /// writes it to <paramref name="txtfile"/> as UTF-8.
    /// </summary>
    /// <param name="file">The PDF file to read.</param>
    /// <param name="txtfile">The text file to create or overwrite.</param>
    /// <returns>
    /// An empty string on success, or a message when the file header declares
    /// minor version 4 ("%PDF-1.4"), which this method refuses to process.
    /// </returns>
    public string pdf2txt(FileInfo file, FileInfo txtfile)
    {
        // Inspect the header of the file we were actually asked to convert,
        // and do it before loading the document so nothing is left open on
        // the early return below.
        byte[] header = new byte[PdfHeaderLength];
        int bytesRead;
        using (FileStream headerStream = new FileStream(file.FullName, FileMode.Open, FileAccess.Read))
        {
            bytesRead = headerStream.Read(header, 0, header.Length);
        }

        // "%PDF-1.4" is rejected; per the original author's note, "%PDF-1.7" is readable.
        if (bytesRead == header.Length && header[PdfMinorVersionOffset] == (byte)'4')
        {
            return "PDF版本太低,无法读出.";
        }

        PDDocument doc = PDDocument.load(file.FullName);
        try
        {
            PDFTextStripper pdfStripper = new PDFTextStripper();
            string text = pdfStripper.getText(doc);

            using (StreamWriter writer = new StreamWriter(txtfile.FullName, false, Encoding.GetEncoding("utf-8")))
            {
                writer.Write(text);
            }
        }
        finally
        {
            // Release the document even when extraction or writing fails.
            doc.close();
        }

        return "";
    }

    /// <summary>
    /// Extracts the text of every page of <paramref name="filepath"/> with
    /// iTextSharp and writes it to the shared output file using the gb2312
    /// encoding. Failures are logged through <c>System.Diagnostics.Trace</c>
    /// and not rethrown.
    /// </summary>
    /// <param name="filepath">Path of the PDF file to read.</param>
    private void OnCreated(string filepath)
    {
        try
        {
            StringBuilder text = new StringBuilder();

            PdfReader pdfReader = new PdfReader(filepath);
            try
            {
                int numberOfPages = pdfReader.NumberOfPages;

                // iTextSharp page numbers are 1-based.
                for (int page = 1; page <= numberOfPages; ++page)
                {
                    text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page));
                }
            }
            finally
            {
                pdfReader.Close();
            }

            using (StreamWriter writer = new StreamWriter(OutputTextPath, false, Encoding.GetEncoding("gb2312")))
            {
                writer.Write(text.ToString());
            }
        }
        catch (Exception ex)
        {
            // Keep the handler best-effort, but leave a trace of what went wrong.
            System.Diagnostics.Trace.TraceError("PDF text extraction failed for {0}: {1}", filepath, ex);
        }
    }
}
}
引用
```