Java爬虫(获取验证码爬取网页信息)

Java爬虫(获取验证码爬取网页信息):用到了图片识别库 Tess4J,需要另外下载 tessdata 语言数据包。

maven pom.xml

 <dependencies>
     <!--
       Each artifact is declared exactly once. The original list declared
       httpclient (4.5.6 and 4.3.6) and tess4j (2.0.1 and 3.2.1) twice; Maven
       warns about duplicate declarations and lets the later one win, so the
       versions kept below are the ones that were effectively resolved before.
     -->

     <!-- HTTP client used for the login / captcha / data requests. -->
     <dependency>
         <groupId>org.apache.httpcomponents</groupId>
         <artifactId>httpclient</artifactId>
         <version>4.3.6</version>
     </dependency>

     <!-- HTML parsing of the login page. -->
     <dependency>
         <groupId>org.jsoup</groupId>
         <artifactId>jsoup</artifactId>
         <version>1.11.3</version>
     </dependency>

     <!-- net.sf.json JSONObject / JSONArray used to parse the API responses. -->
     <dependency>
         <groupId>net.sf.json-lib</groupId>
         <artifactId>json-lib</artifactId>
         <version>2.0</version>
         <classifier>jdk15</classifier>
     </dependency>

     <!-- Excel export. -->
     <dependency>
         <groupId>org.apache.poi</groupId>
         <artifactId>poi</artifactId>
         <version>3.15-beta2</version>
     </dependency>
     <dependency>
         <groupId>org.apache.poi</groupId>
         <artifactId>poi-ooxml</artifactId>
         <version>3.15-beta2</version>
     </dependency>

     <!-- Native access layer and the Tesseract OCR binding used to read the captcha. -->
     <dependency>
         <groupId>net.java.dev.jna</groupId>
         <artifactId>jna</artifactId>
         <version>4.1.0</version>
     </dependency>
     <dependency>
         <groupId>net.sourceforge.tess4j</groupId>
         <artifactId>tess4j</artifactId>
         <version>3.2.1</version>
     </dependency>

     <!-- Logging. logback-core is pinned to the same version as logback-classic
          instead of the non-reproducible RELEASE meta-version. -->
     <dependency>
         <groupId>ch.qos.logback</groupId>
         <artifactId>logback-classic</artifactId>
         <version>1.2.3</version>
     </dependency>
     <dependency>
         <groupId>ch.qos.logback</groupId>
         <artifactId>logback-core</artifactId>
         <version>1.2.3</version>
     </dependency>

     <dependency>
         <groupId>org.json</groupId>
         <artifactId>json</artifactId>
         <version>20180130</version>
     </dependency>
 </dependencies>

创建一个 PictureAddressUtil 工具类,用于从 HTML 片段中提取图片地址:


import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts image addresses from HTML markup with regular expressions.
 *
 * <p>This is a lightweight scan, not an HTML parser: a tag is considered to end
 * at the first {@code >} after {@code <img}, and only quoted {@code src} values
 * are recognised.
 */
public class PictureAddressUtil {

	/**
	 * Matches an {@code <img ...>} tag in any letter case, including tags that
	 * span several lines. Group 1 is the attribute text up to the first {@code >}.
	 */
	private static final Pattern IMG_TAG =
			Pattern.compile("<img(.*?)>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

	/**
	 * Matches a quoted {@code src} attribute. Group 1 is the opening quote and the
	 * back-reference forces the value (group 2) to be closed by that same quote,
	 * so an apostrophe inside a double-quoted value does not end it early.
	 */
	private static final Pattern SRC_ATTRIBUTE =
			Pattern.compile("src\\s*=\\s*([\"'])(.*?)\\1", Pattern.CASE_INSENSITIVE);

	/**
	 * Collects the {@code src} value of every {@code <img>} tag found in the markup.
	 *
	 * @param content HTML text to scan; {@code null} or empty yields an empty list
	 * @return the image addresses in document order; tags without a quoted
	 *         {@code src} attribute are skipped; never {@code null}
	 */
	public List<String> filePath(String content) {
		List<String> srcList = new ArrayList<String>();
		if (content == null || content.isEmpty()) {
			return srcList;
		}
		Matcher imgMatcher = IMG_TAG.matcher(content);
		while (imgMatcher.find()) {
			Matcher srcMatcher = SRC_ATTRIBUTE.matcher(imgMatcher.group(1));
			if (srcMatcher.find()) {
				srcList.add(srcMatcher.group(2));
			}
		}
		return srcList;
	}
}

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.CookieStore;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
public class HuaNanShiFanStudent {
	
	/**
	 * Scrapes the student list and exports the first two fields of every record
	 * (student number and name) to an {@code .xls} workbook.
	 *
	 * <p>The workbook and the output stream are managed with try-with-resources
	 * (Java 7+) so both are closed even when scraping or writing fails.
	 *
	 * @param args unused
	 * @throws Exception if scraping fails or the workbook cannot be written
	 */
	public static void main(String[] args) throws Exception {
		String name = "学生信息列表";
		try (HSSFWorkbook wk = new HSSFWorkbook()) {
			HSSFSheet sheet = wk.createSheet();
			sheet.setColumnWidth(0, 5000);

			// Header row.
			HSSFRow row = sheet.createRow(0);
			HSSFCell cell = row.createCell(0);
			cell.setCellValue("学号");
			cell = row.createCell(1);
			cell.setCellValue("姓名");

			// One data row per scraped record, placed below the header.
			HuaNanShiFanStudent fanStudent = new HuaNanShiFanStudent();
			List<TR> trs = fanStudent.HuaNanShiFanStudent_List();
			for (int i = 0; i < trs.size(); i++) {
				row = sheet.createRow(i + 1);
				row.createCell(0).setCellValue(trs.get(i).getTd0());
				row.createCell(1).setCellValue(trs.get(i).getTd1());
			}

			// The stream used to be created inline and never closed, which can
			// leave the file handle open (and the file locked on Windows).
			try (FileOutputStream out =
					new FileOutputStream(new File("C:\\Users\\啊\\Desktop\\"+name+".xls"))) {
				wk.write(out);
			}
		}
		System.out.println("运行成功");
	}
	
	
	
	
	 /**
	  * Logs in with an OCR-solved captcha and downloads the complete student list.
	  *
	  * <p>Steps: load the login page, find the captcha image inside the element
	  * with id {@code imgCode}, download it to {@code 1.jpg}, read it with
	  * {@link #getImgContent(String)}, submit the login request, query the total
	  * record count and finally fetch the records in pages of 1000.
	  *
	  * <p>A single HTTP client is used for the whole session so its cookie store
	  * carries the login session between requests; it is shut down before the
	  * method returns. Any step that answers with a status other than 200 ends
	  * the scrape early (or skips that page) and the rows gathered so far are
	  * returned.
	  *
	  * <p>NOTE(review): the URLs and credentials in this method look like
	  * placeholders and must be replaced with the real endpoints - confirm.
	  *
	  * @return the scraped rows, possibly empty; never {@code null}
	  * @throws IllegalStateException if the login page contains no captcha image
	  * @throws Exception on I/O failures or malformed JSON responses
	  */
	 public List<TR> HuaNanShiFanStudent_List() throws Exception {
		List<TR> trs = new ArrayList<TR>();
		DefaultHttpClient client = new DefaultHttpClient();
		try {
			// Step 1: load the login page and locate the captcha image address.
			String loginPage = fetchBody(client, new HttpPost("http://www.baidu.com/"));
			if (loginPage == null) {
				return trs;
			}
			Document loginDocument = Jsoup.parse(loginPage);
			org.jsoup.nodes.Element captchaElement = loginDocument.getElementById("imgCode");
			if (captchaElement == null) {
				throw new IllegalStateException("login page has no element with id 'imgCode'");
			}
			List<String> imageAddresses = new PictureAddressUtil().filePath(captchaElement.toString());
			if (imageAddresses.isEmpty()) {
				throw new IllegalStateException("element 'imgCode' contains no image address");
			}

			// Step 2: download the captcha and recognise it with Tess4J.
			HttpResponse captchaResponse =
					client.execute(new HttpPost("http://www.baidu.com/" + imageAddresses.get(0)));
			try {
				downloadJPG(captchaResponse, "1.jpg");
			} finally {
				// Make sure the connection is free for the next request.
				EntityUtils.consume(captchaResponse.getEntity());
			}
			String code = getImgContent("1.jpg");
			System.out.println("验证码 = " + code);
			System.out.println("===============================");

			// Step 3: log in. The recognised text is trimmed and URL-encoded because
			// OCR output may contain whitespace or other characters that are not
			// legal inside a URI.
			String encodedCode = java.net.URLEncoder.encode(code.trim(), "UTF-8");
			String loginResult = fetchBody(client, new HttpPost(
					"http://www.baidu.com?username=用户名&password=密码&authCode=" + encodedCode));
			if (loginResult == null) {
				return trs;
			}
			System.out.println("Response content: " + loginResult);
			System.err.println("===============================");

			// Step 4: ask for the total number of records.
			String summaryJson = fetchBody(client, new HttpPost("接口"));
			if (summaryJson == null) {
				return trs;
			}
			int totalCount = Integer.parseInt(JSONObject.fromObject(summaryJson).getString("totalCount"));
			System.out.println("totalCount========"+totalCount);

			// Step 5: fetch every page; the last page may be partially filled.
			int limit = 1000;
			int pageCount = totalCount / limit + (totalCount % limit > 0 ? 1 : 0);
			for (int page = 0; page < pageCount; page++) {
				System.out.println(page);
				int start = page * limit;
				String pageJson = fetchBody(client, new HttpPost(
						"http://www.baidu.com?&start="+start+"&limit="+limit+"&sort=id&dir=DESC"));
				if (pageJson == null) {
					continue;
				}
				JSONArray data = JSONObject.fromObject(pageJson).getJSONArray("models");
				for (int i = 0; i < data.length(); i++) {
					trs.add(toRow(data.getJSONObject(i)));
				}
			}
			return trs;
		} finally {
			client.getConnectionManager().shutdown();
		}
	 }

	 /**
	  * Executes the request and returns the response body, or {@code null} when the
	  * status is not 200.
	  *
	  * <p>The entity is always consumed, so the client's connection is available
	  * for the next request. The body is decoded with the charset declared by the
	  * response, falling back to UTF-8 when none is declared.
	  */
	 private static String fetchBody(DefaultHttpClient client, HttpPost request) throws IOException {
		HttpResponse response = client.execute(request);
		HttpEntity entity = response.getEntity();
		if (response.getStatusLine().getStatusCode() != 200) {
			EntityUtils.consume(entity);
			return null;
		}
		return EntityUtils.toString(entity, "UTF-8");
	 }

	 /** Copies the fields of one entry of the "models" array into a {@code TR} row. */
	 private static TR toRow(JSONObject student) {
		JSONObject studentInfo = student.getJSONObject("prStudentInfo");
		TR tr = new TR();
		tr.setTd0(student.getString("regNo"));
		tr.setTd1(student.getString("trueName"));
		tr.setTd2(studentInfo.getString("cardNo"));
		tr.setTd3(studentInfo.getString("gender"));
		tr.setTd4(student.getJSONObject("peSite").getString("name"));
		tr.setTd5(student.getJSONObject("peGrade").getString("name"));
		tr.setTd6(student.getJSONObject("peMajor").getString("name"));
		// NOTE(review): this column was named "peEdutype" in the original code but is
		// read from "peMajor", so it duplicates the previous column. Possibly a
		// copy-paste slip - confirm the JSON key before changing it.
		tr.setTd7(student.getJSONObject("peMajor").getString("name"));
		tr.setTd8(student.getJSONObject("enumConstByFlagMajorType").getString("name"));
		tr.setTd9(student.getJSONObject("enumConstByFlagStudentStatus").getString("name"));
		return tr;
	 }
		
	
	/**
	 * Writes the body of an HTTP response to a file.
	 *
	 * <p>Both streams are closed when the method returns, including when the copy
	 * fails. The previous version flushed the output after closing it, never
	 * closed the response stream and leaked the file handle on a failed copy.
	 *
	 * @param httpResponse response whose entity body is saved
	 * @param fileName     path of the file to create or overwrite
	 * @throws IOException if the response has no entity or the copy fails
	 */
	public static void downloadJPG(HttpResponse httpResponse,String fileName) throws IOException {
		if (httpResponse.getEntity() == null) {
			throw new IOException("response has no entity to save to " + fileName);
		}
		try (InputStream input = httpResponse.getEntity().getContent();
				OutputStream output = new FileOutputStream(new File(fileName))) {
			IOUtils.copy(input, output);
			output.flush();
		}
	}
	/**
	 * Runs Tesseract OCR on an image file and returns the recognised text with
	 * line feeds removed.
	 *
	 * <p>The recognised text is also printed to standard output. When OCR fails,
	 * the exception message is printed to standard error and an empty string is
	 * returned.
	 *
	 * @param imgUrl path of the image file to read
	 * @return the recognised text without {@code \n} characters, or {@code ""} on
	 *         an OCR failure
	 */
	public static String getImgContent(String imgUrl) {
		File captchaImage = new File(imgUrl);

		ITesseract ocr = new Tesseract();
		// Tesseract language data is read from a local Tesseract-OCR installation.
		ocr.setDatapath("C:\\Program Files (x86)\\Tesseract-OCR\\tessdata");
		// NOTE(review): the original comment said the English data recognises digits
		// more accurately, yet simplified Chinese is what is configured - confirm
		// which language is intended.
		ocr.setLanguage("chi_sim");

		try {
			String recognized = ocr.doOCR(captchaImage).replace("\n", "");
			System.out.println(recognized);
			return recognized;
		} catch (TesseractException e) {
			System.err.println(e.getMessage());
			return "";
		}
	}
		

}
  • 0
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值