Java实现爬虫技术：读取txt、word、excel、ppt、pdf、html等格式的文件

本文链接：https://blog.csdn.net/sheng_xinjun/article/details/52925484

最近和同事一起做的项目需要读取txt、word、excel、ppt、pdf、html等文件中的内容。这里先把代码贴出来，之后有时间再做详细的解读。

这是读取txt文件的代码：

/**
	 * Reads a text file and returns its content as a single trimmed string.
	 * <p>
	 * The charset is chosen from the byte-order mark (BOM) when the file starts
	 * with one (UTF-8, UTF-16 little-endian or UTF-16 big-endian). Files without
	 * a BOM are decoded with the platform default charset. Lines are joined with
	 * a single newline character so that the last word of one line does not fuse
	 * with the first word of the next.
	 * <p>
	 * I/O failures are printed to standard error; whatever was read up to that
	 * point (possibly nothing) is still returned.
	 * 
	 * @param txtFile
	 *            the text file to read
	 * @return the decoded content with leading and trailing whitespace removed;
	 *         the empty string if the file could not be opened
	 */
	public String GetTxtContent(File txtFile) {
		StringBuilder contents = new StringBuilder();
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(openTxtReader(txtFile));
			String line;
			while ((line = reader.readLine()) != null) {
				// readLine() strips the line terminator, so put one back.
				contents.append(line).append('\n');
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		// trim() also drops the newline appended after the last line.
		return contents.toString().trim();
	}

	/**
	 * Opens the file as a character stream, selecting the charset from its BOM
	 * when it has one and the platform default otherwise.
	 */
	private java.io.Reader openTxtReader(File txtFile) throws IOException {
		java.io.PushbackInputStream in = new java.io.PushbackInputStream(
				new java.io.FileInputStream(txtFile), 3);
		try {
			// Peek at up to three bytes; a single read() may return fewer.
			byte[] head = new byte[3];
			int filled = 0;
			int got;
			while (filled < head.length
					&& (got = in.read(head, filled, head.length - filled)) > 0) {
				filled += got;
			}

			String charset = null;
			int bomLength = 0;
			if (filled >= 3 && head[0] == (byte) 0xEF && head[1] == (byte) 0xBB
					&& head[2] == (byte) 0xBF) {
				charset = "UTF-8";
				bomLength = 3;
			} else if (filled >= 2 && head[0] == (byte) 0xFF
					&& head[1] == (byte) 0xFE) {
				charset = "UTF-16LE";
				bomLength = 2;
			} else if (filled >= 2 && head[0] == (byte) 0xFE
					&& head[1] == (byte) 0xFF) {
				charset = "UTF-16BE";
				bomLength = 2;
			}

			// Give back everything that was peeked except the BOM itself.
			if (filled > bomLength) {
				in.unread(head, bomLength, filled - bomLength);
			}
			if (charset == null) {
				return new java.io.InputStreamReader(in);
			}
			return new java.io.InputStreamReader(in, charset);
		} catch (IOException e) {
			try {
				in.close();
			} catch (IOException ignored) {
				// Best effort: the original failure is the one worth reporting.
			}
			throw e;
		}
	}

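<h1>读取ppt</h1>
	/**
	 * Reads the text content of a PowerPoint (.ppt) file.
	 * 
	 * @param excleFile
	 *            the PPT file to read
	 * @return the text of every text run on every slide, concatenated and
	 *         trimmed
	 */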
	public String GetPPTContent(File excleFile) {
		// Contract: returns the text of every text run on every slide, concatenated
		// in slide order and trimmed. If the file cannot be opened or read, the
		// failure is printed and whatever was collected so far (normally the empty
		// string) is returned instead of failing with a NullPointerException.
		StringBuilder contents = new StringBuilder();
		InputStream is = null;
		try {
			is = new FileInputStream(excleFile);
			SlideShow ppt = new SlideShow(new HSLFSlideShow(is));
			Slide[] slides = ppt.getSlides();
			for (int i = 0; i < slides.length; i++) {
				// Each TextRun holds the text of one text block on the slide.
				TextRun[] runs = slides[i].getTextRuns();
				for (int j = 0; j < runs.length; j++) {
					contents.append(runs[j].getText());
				}
			}
		} catch (FileNotFoundException e1) {
			e1.printStackTrace();
		} catch (IOException e1) {
			e1.printStackTrace();
		} finally {
			// Close in finally so the stream is released even when parsing fails.
			if (is != null) {
				try {
					is.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		return contents.toString().trim();
	}

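<h1>读取excel</h1>
	/**
	 * Reads the text content of an Excel 2007 (.xlsx) workbook.
	 * 
	 * @param exclexlsxFile
	 *            the .xlsx file to read
	 * @return the extracted cell text, trimmed
	 */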
	public String GetExclexlsxContent(File exclexlsxFile) {
		// Contract: walks every sheet, row and cell of the workbook and collects
		// string cells and date-formatted numeric cells (as "yyyy-MM-dd HH:mm:ss").
		// A "\n" is emitted per sheet and per row; cells of one row are not
		// separated. If the file cannot be opened or read, the failure is printed
		// and the empty string is returned instead of a NullPointerException.
		StringBuilder content = new StringBuilder();
		InputStream in = null;
		try {
			in = new FileInputStream(exclexlsxFile);
			XSSFWorkbook workbook = new XSSFWorkbook(in);
			// Created once per call; it stays local because SimpleDateFormat is
			// not safe to share between threads.
			SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

			for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
				XSSFSheet aSheet = workbook.getSheetAt(numSheets);
				content.append("\n");
				if (null == aSheet) {
					continue;
				}
				for (int rowNum = 0; rowNum <= aSheet.getLastRowNum(); rowNum++) {
					content.append("\n");
					XSSFRow aRow = aSheet.getRow(rowNum);
					if (null == aRow) {
						continue;
					}

					// getLastCellNum() is already "last index + 1" (and -1 for a
					// row without cells), so the bound is exclusive.
					for (int cellNum = 0; cellNum < aRow.getLastCellNum(); cellNum++) {
						XSSFCell aCell = aRow.getCell(cellNum);
						if (null == aCell) {
							continue;
						}
						if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
							content.append(aCell.getRichStringCellValue()
									.getString());
						} else if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
							// NOTE(review): numeric cells that are not date-formatted
							// (and boolean/formula cells) contribute nothing to the
							// output - confirm that this is intended.
							if (HSSFDateUtil.isCellDateFormatted(aCell)) {
								Date date = aCell.getDateCellValue();
								content.append(df.format(date));
							}
						}
					}
				}
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Close in finally so the stream is released even when parsing fails.
			if (in != null) {
				try {
					in.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}

		return content.toString().trim();
	}
	/**
	 * Reads the text content of an Excel (.xls) workbook.
	 * <p>
	 * Every sheet, row and cell is visited. String cells and date-formatted
	 * numeric cells (rendered as {@code yyyy-MM-dd HH:mm:ss}) are collected. A
	 * newline is emitted per sheet and per row; cells of one row are not
	 * separated. If the file cannot be opened or read, the failure is printed
	 * and the empty string is returned.
	 * 
	 * @param excleFile
	 *            the .xls file to read
	 * @return the extracted cell text, trimmed; empty if the workbook could not
	 *         be opened
	 */
	public String GetExcleContent(File excleFile) {
		StringBuilder content = new StringBuilder();
		InputStream in = null;
		try {
			in = new FileInputStream(excleFile);
			HSSFWorkbook workbook = new HSSFWorkbook(in);
			// Created once per call; it stays local because SimpleDateFormat is
			// not safe to share between threads.
			SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

			for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
				HSSFSheet aSheet = workbook.getSheetAt(numSheets);
				content.append("\n");
				if (null == aSheet) {
					continue;
				}
				for (int rowNum = 0; rowNum <= aSheet.getLastRowNum(); rowNum++) {
					content.append("\n");
					HSSFRow aRow = aSheet.getRow(rowNum);
					if (null == aRow) {
						continue;
					}

					// getLastCellNum() is already "last index + 1" (and -1 for a
					// row without cells), so the bound is exclusive.
					for (int cellNum = 0; cellNum < aRow.getLastCellNum(); cellNum++) {
						HSSFCell aCell = aRow.getCell(cellNum);
						if (null == aCell) {
							continue;
						}

						if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
							content.append(aCell.getRichStringCellValue()
									.getString());
						} else if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
							// NOTE(review): numeric cells that are not date-formatted
							// (and boolean/formula cells) contribute nothing to the
							// output - confirm that this is intended.
							if (HSSFDateUtil.isCellDateFormatted(aCell)) {
								Date date = aCell.getDateCellValue();
								content.append(df.format(date));
							}
						}
					}
				}
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Close in finally so the stream is released even when parsing fails.
			if (in != null) {
				try {
					in.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}

		return content.toString().trim();
	}

<h1>读取word</h1>
	/**
	 * Extracts the text of a Word (.doc) file.
	 * <p>
	 * Any exception raised while opening or extracting is printed and the empty
	 * string is returned; the input stream is always closed.
	 * 
	 * @param wordFile
	 *            the .doc file to read
	 * @return the extracted text, trimmed
	 */
	@SuppressWarnings("resource")
	public  String GetWordContent(File wordFile) {
		FileInputStream stream = null;
		String extracted = "";
		try {
			stream = new FileInputStream(wordFile);
			extracted = new WordExtractor(stream).getText();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			try {
				if (stream != null) {
					stream.close();
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		return extracted.trim();
	}
	/**
	 * Extracts the text of a Word 2007 (.docx) file.
	 * <p>
	 * Failures while opening or extracting are printed and the empty string is
	 * returned. The underlying package is always released afterwards so the
	 * file handle does not leak.
	 * 
	 * @param wordDocxFile
	 *            the .docx file to read
	 * @return the extracted text, trimmed
	 */
	public String GetWordDocxContent(File wordDocxFile) {
		String text2007 = "";
		OPCPackage opcPackage = null;
		try {
			opcPackage = POIXMLDocument.openPackage(wordDocxFile.getPath());
			POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
			text2007 = extractor.getText();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (XmlException e) {
			e.printStackTrace();
		} catch (OpenXML4JException e) {
			e.printStackTrace();
		} finally {
			if (opcPackage != null) {
				// revert() closes the package WITHOUT saving; close() on a
				// package opened from a path would write it back to the file.
				opcPackage.revert();
			}
		}
		return text2007.trim();
	}

<h1>读取pdf</h1>

	/**
	 * Extracts the text of a PDF file.
	 * <p>
	 * Any exception raised while parsing or extracting is printed and the text
	 * collected so far (initially the empty string) is returned. The input
	 * stream is closed first, then the parsed document.
	 * 
	 * @param pdfFile
	 *            the PDF file to read
	 * @return the extracted text, trimmed
	 */
	public String GetPDFContent(File pdfFile) {
		PDDocument document = null;
		FileInputStream input = null;
		String text = "";
		try {
			input = new FileInputStream(pdfFile);
			PDFParser pdfParser = new PDFParser(input);
			pdfParser.parse();
			document = pdfParser.getPDDocument();
			text = new PDFTextStripper().getText(document);
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			try {
				if (input != null) {
					input.close();
				}
			} catch (Exception e) {
				e.printStackTrace();
			}
			try {
				if (document != null) {
					document.close();
				}
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
		return text.trim();
	}

<h1>读取html</h1>
	/**
	 * Reads the plain-text content of a web page; the result is meant to be
	 * stored in the index.
	 * 
	 * @param url
	 *            the address handed to the HTML parser
	 * @return the text collected from the page, trimmed
	 * @throws ParserException
	 *             propagated from the parser
	 */
	public String GetHTML(String url) throws ParserException {
		StringBean textCollector = new StringBean();
		// Leave link information out of the collected text.
		textCollector.setLinks(false);
		// Replace non-breaking spaces with regular spaces.
		textCollector.setReplaceNonBreakingSpaces(true);
		// Collapse runs of whitespace into a single space.
		textCollector.setCollapse(true);

		Parser pageParser = new Parser(url);
		pageParser.visitAllNodesWith(textCollector);

		String pageText = textCollector.getStrings();
		return pageText.trim();
	}
	/**
	 * Collects the text content of all attachments described by a JSON array.
	 * <p>
	 * {@code param} is parsed as a JSON array of objects; the {@code id} and
	 * {@code name} entries are read from each one. For every element where both
	 * are non-empty, the stored location is looked up in {@code t_srffile},
	 * appended to {@code filePath}, and the text returned by
	 * {@code GetFileContent} for that path is appended to the result.
	 * 
	 * @param param
	 *            JSON array text describing the attachments
	 * @param service
	 *            service used to run the lookup query
	 * @param filePath
	 *            upload directory that the stored location is appended to
	 * @return the concatenated content of all attachments; the empty string
	 *         when {@code param} cannot be parsed or contains no elements
	 * @throws Exception
	 *             propagated from the JSON accessors or from
	 *             {@code GetFileContent}
	 */
	public String HandleFj(String param,IService service,String filePath) throws Exception{
		JSONArray json=null;
		try {
			// Parse the argument itself. The literal "param" used to be passed
			// here, which left the parameter unused.
			json=JSONArray.fromObject(DataObject.getObjectValue(param));
		} catch (Exception e) {
			e.printStackTrace();
			return "";
		}
		if(StringHelper.isNullOrEmpty(json)){
			return "";
		}

		StringBuffer fjcontenttotal=new StringBuffer();
		for(int i=0;i<json.length();i++){
			String fileid=json.getJSONObject(i).getString("id");
			String name=json.getJSONObject(i).getString("name");
			if(StringHelper.isNullOrEmpty(fileid)||StringHelper.isNullOrEmpty(name)){
				continue;
			}

			// NOTE(review): fileid comes from the JSON argument and is concatenated
			// into the SQL text. Quotes are doubled as a minimal safeguard; prefer
			// bind parameters if selectRaw supports them (its second argument is
			// passed as null here) - confirm against IService.
			String sql="select t.localpath from t_srffile t where t.file_id='"
					+fileid.replace("'", "''")+"'";

			// Scoped per element so a failed query cannot reuse the previous
			// element's rows or dereference null.
			ArrayList<IEntity> list=null;
			try {
				list=service.selectRaw(sql, null);
			} catch (Exception e) {
				e.printStackTrace();
			}
			if(list==null){
				continue;
			}

			for(IEntity o:list){
				// NOTE(review): the query selects "localpath" but "location" is
				// read here - confirm which column name is correct.
				String location=DataObject.getStringValue(o.get("location"));
				String fjtotalpath=filePath+location;
				fjcontenttotal.append(this.GetFileContent(fjtotalpath));
			}
		}
		// Return only after every element has been processed; returning inside
		// the loop stopped after the first attachment.
		return fjcontenttotal.toString();
	}