// 收藏开发过程中常用的一些代码段。下面的代码演示如何在 C# 中用 iTextSharp 将 PDF 转换成文本。
using System;
using System.IO;
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
public class ParsingPDF {
static string PDF;
static string TEXT2;
public void parsePdf(String src, String dest)
{
PdfReader reader = new PdfReader(src);
StreamWriter output = new StreamWriter(new FileStream(dest, FileMode.Create));
int pageCount = reader.NumberOfPages;
for (int pg = 1; pg <= pageCount; pg++)
{
byte[] streamBytes = reader.GetPageContent(pg);
PRTokeniser tokenizer = new PRTokeniser(streamBytes);
while (tokenizer.NextToken())
{
if (tokenizer.TokenType == PRTokeniser.TokType.STRING)
{
output.WriteLine(tokenizer.StringValue);
}
}
}