Java爬虫(识别验证码并爬取网页信息):使用图片识别库 Tess4J,需要先下载 tessdata 语言数据
maven pom.xml
<!--
  Dependencies for the captcha crawler.
  Each artifact is declared exactly once: the original list declared httpclient and tess4j
  twice with different versions, which makes Maven emit a "must be unique" warning and use
  only one of the declarations.
  NOTE(review): the Java code imports org.apache.commons.io.IOUtils but commons-io is not
  declared here; presumably it is resolved transitively - confirm with mvn dependency:tree.
-->
<dependencies>
  <!-- HTTP client used for the login and listing requests -->
  <dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.6</version>
  </dependency>
  <!-- HTML parsing of the login page -->
  <dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
  </dependency>
  <!-- net.sf.json JSONObject / JSONArray -->
  <dependency>
    <groupId>net.sf.json-lib</groupId>
    <artifactId>json-lib</artifactId>
    <version>2.0</version>
    <classifier>jdk15</classifier>
  </dependency>
  <!-- Excel export -->
  <dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.15-beta2</version>
  </dependency>
  <dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.15-beta2</version>
  </dependency>
  <!-- Native access layer required by Tess4J -->
  <dependency>
    <groupId>net.java.dev.jna</groupId>
    <artifactId>jna</artifactId>
    <version>4.1.0</version>
  </dependency>
  <!-- OCR of the captcha image (the tessdata language files must be installed separately) -->
  <dependency>
    <groupId>net.sourceforge.tess4j</groupId>
    <artifactId>tess4j</artifactId>
    <version>3.2.1</version>
  </dependency>
  <!-- Logging; logback-core is pinned to the same version as logback-classic instead of
       the deprecated, non-reproducible RELEASE meta-version -->
  <dependency>
    <groupId>ch.qos.logback</groupId>
    <artifactId>logback-classic</artifactId>
    <version>1.2.3</version>
  </dependency>
  <dependency>
    <groupId>ch.qos.logback</groupId>
    <artifactId>logback-core</artifactId>
    <version>1.2.3</version>
  </dependency>
  <dependency>
    <groupId>org.json</groupId>
    <artifactId>json</artifactId>
    <version>20180130</version>
  </dependency>
</dependencies>
创建一个PictureAddressUtil工具类 获取图片地址
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts image addresses from HTML text using regular expressions.
 *
 * <p>This is a lightweight scanner, not an HTML parser: a {@code >} character inside an
 * attribute value ends the tag early.
 */
public class PictureAddressUtil {
    /** Matches an {@code <img ...>} tag in any letter case; group 1 is the attribute text. */
    private static final Pattern IMG_TAG =
            Pattern.compile("<img\\b([^>]*)>", Pattern.CASE_INSENSITIVE);

    /**
     * Matches a quoted {@code src} attribute in any letter case; group 2 is the address.
     * The look-behind rejects attributes that merely end in "src" (such as {@code data-src}),
     * and the back-reference requires the closing quote to be the same as the opening one.
     */
    private static final Pattern SRC_ATTRIBUTE =
            Pattern.compile("(?<![\\w-])src\\s*=\\s*([\"'])(.*?)\\1", Pattern.CASE_INSENSITIVE);

    /**
     * Collects the {@code src} value of every {@code img} tag found in the given text.
     *
     * @param content HTML text to scan; may be {@code null}
     * @return the addresses in document order; empty when {@code content} is {@code null} or
     *         contains no {@code img} tag with a quoted {@code src} attribute
     */
    public List<String> filePath(String content) {
        List<String> srcList = new ArrayList<String>();
        if (content == null) {
            return srcList;
        }
        Matcher tagMatcher = IMG_TAG.matcher(content);
        while (tagMatcher.find()) {
            Matcher srcMatcher = SRC_ATTRIBUTE.matcher(tagMatcher.group(1));
            // Tags without a quoted src attribute are skipped.
            if (srcMatcher.find()) {
                srcList.add(srcMatcher.group(2));
            }
        }
        return srcList;
    }
}
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.CookieStore;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
/**
 * Crawls a captcha-protected student listing and exports it to an Excel (.xls) workbook.
 *
 * <p>Flow: load the login page, download the captcha image referenced by the element with id
 * {@code imgCode}, OCR it with Tess4J, send the login request, then page through the JSON
 * listing and collect one {@code TR} per student.
 *
 * <p>The URLs and the user name / password values embedded in the request strings are
 * placeholders and have to be replaced before the crawler can return real data.
 */
public class HuaNanShiFanStudent {
    /** Number of records requested per listing page. */
    private static final int PAGE_SIZE = 1000;

    /**
     * Runs the crawl and writes the student number and name of every record to an .xls file.
     *
     * @param args unused
     * @throws Exception if crawling or writing the workbook fails
     */
    public static void main(String[] args) throws Exception {
        String name = "学生信息列表";
        HSSFWorkbook wk = new HSSFWorkbook();
        try {
            HSSFSheet sheet = wk.createSheet();
            sheet.setColumnWidth(0, 5000);
            // Header row.
            HSSFRow row = sheet.createRow(0);
            HSSFCell cell = row.createCell(0);
            cell.setCellValue("学号");
            cell = row.createCell(1);
            cell.setCellValue("姓名");

            HuaNanShiFanStudent fanStudent = new HuaNanShiFanStudent();
            List<TR> trs = fanStudent.HuaNanShiFanStudent_List();
            // Only the first two collected fields (td0, td1) are exported.
            for (int i = 0; i < trs.size(); i++) {
                row = sheet.createRow(i + 1);
                row.createCell(0).setCellValue(trs.get(i).getTd0());
                row.createCell(1).setCellValue(trs.get(i).getTd1());
            }

            OutputStream out = new FileOutputStream(new File("C:\\Users\\啊\\Desktop\\" + name + ".xls"));
            try {
                wk.write(out);
            } finally {
                // Close the file even when writing fails so the handle is not leaked.
                out.close();
            }
        } finally {
            wk.close();
        }
        System.out.println("运行成功");
    }

    /**
     * Logs in (solving the captcha via OCR) and downloads the complete student listing.
     *
     * <p>All requests go through one client so that the session cookies set by the login page
     * are sent with every later request.
     *
     * @return one {@code TR} per student; empty when the login page, the login request or the
     *         count request does not answer with HTTP 200
     * @throws IllegalStateException if the login page contains no captcha image
     * @throws Exception on I/O, URI or JSON failures
     */
    public List<TR> HuaNanShiFanStudent_List() throws Exception {
        List<TR> trs = new ArrayList<TR>();
        DefaultHttpClient client = new DefaultHttpClient();
        try {
            // Step 1: load the login page; this also establishes the session cookies.
            String loginPage = fetchBody(client, new HttpPost("http://www.baidu.com/"));
            if (loginPage == null) {
                return trs;
            }

            // Step 2: locate the captcha image. String.valueOf turns a missing element into
            // the text "null", which contains no img tag and is rejected by the check below.
            Document loginDoc = Jsoup.parse(loginPage);
            String captchaHtml = String.valueOf(loginDoc.getElementById("imgCode"));
            List<String> captchaPaths = new PictureAddressUtil().filePath(captchaHtml);
            if (captchaPaths.isEmpty()) {
                throw new IllegalStateException("No captcha image found in element 'imgCode' of the login page");
            }

            // Step 3: download the captcha and recognise it with Tess4J.
            HttpResponse captchaResponse = client.execute(new HttpPost("http://www.baidu.com/" + captchaPaths.get(0) + ""));
            downloadJPG(captchaResponse, "1.jpg");
            String code = getImgContent("1.jpg");
            System.out.println("验证码 = " + code);
            System.out.println("===============================");

            // Step 4: log in with the recognised captcha.
            // NOTE(review): code is inserted unencoded; OCR output containing spaces or
            // reserved characters would presumably make the URI invalid - confirm.
            String loginResult = fetchBody(client,
                    new HttpPost("http://www.baidu.com?username=用户名&password=密码&authCode=" + code + ""));
            if (loginResult == null) {
                return trs;
            }
            System.out.println("Response content: " + loginResult);
            System.err.println("===============================");

            // Step 5: ask for the total number of records.
            String countJson = fetchBody(client, new HttpPost("接口"));
            if (countJson == null) {
                return trs;
            }
            int totalCount = Integer.parseInt(JSONObject.fromObject(countJson).getString("totalCount"));
            System.out.println("totalCount========" + totalCount);

            // Step 6: fetch the listing page by page; a partial last page needs one extra request.
            int pageCount = totalCount / PAGE_SIZE + (totalCount % PAGE_SIZE > 0 ? 1 : 0);
            for (int n = 0; n < pageCount; n++) {
                System.out.println(n);
                int start = n * PAGE_SIZE;
                HttpPost pageRequest = new HttpPost(new URI(
                        "http://www.baidu.com?&start=" + start + "&limit=" + PAGE_SIZE + "&sort=id&dir=DESC"));
                String pageJson = fetchBody(client, pageRequest);
                if (pageJson == null) {
                    // A failed page is skipped; the remaining pages are still requested.
                    continue;
                }
                JSONArray data = JSONObject.fromObject(pageJson).getJSONArray("models");
                for (int i = 0; i < data.length(); i++) {
                    trs.add(toRow(data.getJSONObject(i)));
                }
            }
            return trs;
        } finally {
            // Release the pooled connection held by the client.
            client.getConnectionManager().shutdown();
        }
    }

    /**
     * Executes the request and returns the response body for an HTTP 200 answer.
     *
     * @return the body text, or {@code null} for any other status code; in that case the
     *         entity is consumed so the connection can be used for the next request
     */
    private static String fetchBody(DefaultHttpClient client, HttpPost request) throws IOException {
        HttpResponse response = client.execute(request);
        HttpEntity entity = response.getEntity();
        if (response.getStatusLine().getStatusCode() != 200) {
            EntityUtils.consume(entity);
            return null;
        }
        return EntityUtils.toString(entity);
    }

    /** Copies the fields of one JSON student record into a new {@code TR}. */
    private static TR toRow(JSONObject student) {
        JSONObject studentInfo = student.getJSONObject("prStudentInfo");
        TR tr = new TR();
        tr.setTd0(student.getString("regNo"));
        tr.setTd1(student.getString("trueName"));
        tr.setTd2(studentInfo.getString("cardNo"));
        tr.setTd3(studentInfo.getString("gender"));
        tr.setTd4(student.getJSONObject("peSite").getString("name"));
        tr.setTd5(student.getJSONObject("peGrade").getString("name"));
        tr.setTd6(student.getJSONObject("peMajor").getString("name"));
        // NOTE(review): td7 is read from "peMajor" exactly like td6; this looks like a
        // copy-paste slip for "peEdutype" - confirm against the real response before changing.
        tr.setTd7(student.getJSONObject("peMajor").getString("name"));
        tr.setTd8(student.getJSONObject("enumConstByFlagMajorType").getString("name"));
        tr.setTd9(student.getJSONObject("enumConstByFlagStudentStatus").getString("name"));
        return tr;
    }

    /**
     * Saves the body of an HTTP response to a file.
     *
     * @param httpResponse response whose entity content is written out
     * @param fileName path of the file to create or overwrite
     * @throws IOException if reading the response or writing the file fails
     */
    public static void downloadJPG(HttpResponse httpResponse, String fileName) throws IOException {
        InputStream input = httpResponse.getEntity().getContent();
        try {
            OutputStream output = new FileOutputStream(new File(fileName));
            try {
                IOUtils.copy(input, output);
                // Flush while the stream is still open.
                output.flush();
            } finally {
                output.close();
            }
        } finally {
            // Closing the content stream also lets the client reuse the connection.
            input.close();
        }
    }

    /**
     * Runs OCR on an image file and returns the recognised text with line feeds removed.
     *
     * @param imgUrl path of the image file on disk
     * @return the recognised text, or an empty string when Tesseract reports an error
     */
    public static String getImgContent(String imgUrl) {
        String content = "";
        File imageFile = new File(imgUrl);
        ITesseract instance = new Tesseract();
        // Location of the separately installed tessdata language files.
        instance.setDatapath("C:\\Program Files (x86)\\Tesseract-OCR\\tessdata");
        // NOTE(review): the original comment said the English data recognises digits more
        // accurately, yet the simplified-Chinese data is selected - confirm which is intended.
        instance.setLanguage("chi_sim");
        try {
            content = instance.doOCR(imageFile).replace("\n", "");
            System.out.println(content);
        } catch (TesseractException e) {
            // Best effort: report the failure and fall through to return the empty string.
            System.err.println(e.getMessage());
        }
        return content;
    }
}