java爬虫框架哪个好_Java爬虫框架--WebMagic

爬取世纪佳缘小姐姐信息

/**
 * Entity holding one scraped member profile.
 *
 * <p>Plain mutable bean: a no-argument constructor plus getters and setters for every field.
 *
 * @author mxh
 * Created: 2019/5/17 13:44
 */
public class Info {

    /** Identifier; not populated by the five-argument constructor. */
    private Integer id;

    /** Nickname. */
    private String name;

    /** Photo. */
    private String image;

    /** Basic information. */
    private String info;

    /** Love declaration. */
    private String mottos;

    /** Recommendation reason. */
    private String reason;

    /** Creates an empty record; every field starts as {@code null}. */
    public Info() {
    }

    /**
     * Creates a record with every field except {@code id} set.
     *
     * @param name   nickname
     * @param image  photo
     * @param info   basic information
     * @param mottos love declaration
     * @param reason recommendation reason
     */
    public Info(String name, String image, String info, String mottos, String reason) {
        this.name = name;
        this.image = image;
        this.info = info;
        this.mottos = mottos;
        this.reason = reason;
    }

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getImage() {
        return image;
    }

    public void setImage(String image) {
        this.image = image;
    }

    public String getInfo() {
        return info;
    }

    public void setInfo(String info) {
        this.info = info;
    }

    public String getMottos() {
        return mottos;
    }

    public void setMottos(String mottos) {
        this.mottos = mottos;
    }

    public String getReason() {
        return reason;
    }

    public void setReason(String reason) {
        this.reason = reason;
    }

    /**
     * Renders all fields as {@code Info{id=..., name='...', ...}}.
     *
     * @return a human-readable dump of this record
     */
    @Override
    public String toString() {
        return "Info{" +
                "id=" + id +
                ", name='" + name + '\'' +
                ", image='" + image + '\'' +
                ", info='" + info + '\'' +
                ", mottos='" + mottos + '\'' +
                ", reason='" + reason + '\'' +
                '}';
    }
}

dao层

importorg.springframework.stereotype.Repository;/*** @auther mxh

* @time 2019/5/17 13:46*/@Repositorypublic interfaceSJJYMapper {intaddInfo(Info info);

}

爬虫框架持久层

importorg.springframework.beans.factory.annotation.Autowired;importorg.springframework.stereotype.Service;importus.codecraft.webmagic.ResultItems;importus.codecraft.webmagic.Task;importus.codecraft.webmagic.pipeline.Pipeline;/*** @auther mxh

* @time 2019/5/17 13:59

*

* 爬虫框架dao层*/@Servicepublic class SJJYPipeline implementsPipeline {

@AutowiredprivateSJJYMapper sjjyMapper;

@Overridepublic voidprocess(ResultItems resultItems, Task task) {

System.out.println("get page: " +resultItems.getRequest().getUrl());

String[] names= resultItems.get("names").toString().split(",");

String[] images= resultItems.get("images").toString().split(",");

String[] infos= resultItems.get("infos").toString().split(",");

String[] mottoes= resultItems.get("mottoes").toString().split(",");

String[] reasons= resultItems.get("reasons").toString().split(",");for (int i=0;i

Info info= newInfo(names[i],images[i],infos[i],mottoes[i],reasons[i]);

sjjyMapper.addInfo(info);

System.out.println("add info: " +info.toString());

}

}

}

爬虫框架数据筛选逻辑层

importorg.apache.http.Header;importorg.apache.http.HttpResponse;importorg.apache.http.NameValuePair;importorg.apache.http.client.config.RequestConfig;importorg.apache.http.client.entity.UrlEncodedFormEntity;importorg.apache.http.client.methods.CloseableHttpResponse;importorg.apache.http.client.methods.HttpGet;importorg.apache.http.client.methods.HttpPost;importorg.apache.http.client.protocol.HttpClientContext;importorg.apache.http.client.utils.DateUtils;importorg.apache.http.client.utils.URIBuilder;importorg.apache.http.config.Registry;importorg.apache.http.config.RegistryBuilder;importorg.apache.http.cookie.CookieOrigin;importorg.apache.http.cookie.CookieSpecProvider;importorg.apache.http.cookie.MalformedCookieException;importorg.apache.http.impl.client.CloseableHttpClient;importorg.apache.http.impl.client.DefaultHttpClient;importorg.apache.http.impl.client.HttpClients;importorg.apache.http.impl.cookie.DefaultCookieSpec;importorg.apache.http.message.BasicHeader;importorg.openqa.selenium.By;importorg.openqa.selenium.Cookie;importorg.openqa.selenium.WebDriver;importorg.openqa.selenium.chrome.ChromeDriver;importorg.springframework.stereotype.Service;importus.codecraft.webmagic.Page;importus.codecraft.webmagic.Site;importus.codecraft.webmagic.processor.PageProcessor;importus.codecraft.webmagic.selector.Html;import java.io.*;importjava.net.HttpURLConnection;importjava.net.URISyntaxException;importjava.net.URL;importjava.net.URLEncoder;import java.util.*;/*** @auther mxh

* @time 2019/5/16 17:01

*

* 爬虫框架数据筛选逻辑层*/@Servicepublic class SJJYProcessor implementsPageProcessor {private Site site = Site.me().setCharset("utf8").setRetryTimes(1000).setSleepTime(1000);//用来存储cookie信息

private Setcookies;

@Overridepublic voidprocess(Page page) {

Html html=page.getHtml();//照片

List images = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanPic\"]/a/img/@_src").all();//姓名

List names = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanInfo\"]/div[@class=\"jysp_huiyuanName\"]/a/text()").all();//基本信息

List infos = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanInfo\"]/div[@class=\"jysp_huiyuanArea\"]/text()").all();//爱情宣言

List mottos = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanInfo\"]/div[@class=\"jysp_huiyuanText\"]/text()").all();//推荐理由

List reasons = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanInfo\"]/div[@class=\"jysp_huiyuanLy\"]/text()").all();/*输出到控制台 并使dao层接收到数据*/page.putField("names",names);

page.putField("images",images);

page.putField("infos",infos);

page.putField("mottoes",mottos);

page.putField("reasons",reasons);

}

@OverridepublicSite getSite() {//设置主机地址

site.setDomain("www.jiayuan.com");//手动设置cookie//site.addCookie("PHPSESSID","f16de947c3a48a1084d22dd7e72cd283");

/*site.addCookie("PHPSESSID","8b392aacbf80a4d6cf102938271273a7");

site.addCookie("COMMON_HASH","0d8c3daa82c80277292723d74ff197d0");

site.addCookie("PROFILE","207838031%3A%25E5%25BD%25BC%25E5%25BE%2597%25E5%25B8%2595%25E5%2585%258B%3Am%3Aimages1.jyimg.com%2Fw4%2Fglobal%2Fi%3A0%3A%3A1%3Azwzp_m.jpg%3A1%3A1%3A50%3A10%3A3.0");

site.addCookie("RAW_HASH","fYGR2xG5XJL10gfFF4mP3qO0yN65wBrTZpeOrelDWKHerbx69EjQ138l9BfHlTYP%2AGuyrs-5xYCSsUMipqBNkKqExN%2AWVe7sWAWAa5w8VXf-TMA.");

site.addCookie("SESSION_HASH","c2dbd047d891295d1b3e4d5b4cb687e71eeb1afd");

site.addCookie("accessID","20190516163650639629");

site.addCookie("ip_loc","31");

site.addCookie("save_jy_login_name","15735400536");

site.addCookie("stadate1","206838031");

site.addCookie("user_access","1");

site.addCookie("main_search:207838031","%7C%7C%7C00");

site.addCookie("last_login_time","1558057676");*/

//自动追加

for(org.apache.http.cookie.Cookie cookie :cookies) {

site.addCookie(cookie.getName().toString(), cookie.getValue().toString());

}returnsite;

}

// 自动登陆方法

public void login() {

//注册chrome

System.setProperty("webdriver.chrome.driver", "D:\\chromedriver.exe");

WebDriver driver = new ChromeDriver();

driver.get("http://login.jiayuan.com/?refrer=http://www.jiayuan.com&host=0");// 打开网址

// 防止页面未能及时加载出来而设置一段时间延迟

try {

Thread.sleep(2000);

} catch (InterruptedException e) {

e.printStackTrace();

}

// 设置用户名密码

driver.findElement(By.id("login_email")).sendKeys("15735400536"); // 用户名

driver.findElement(By.id("login_password")).sendKeys("mxh970923"); // 密码

// 模拟点击 //form[@id='form-group-login']/button

driver.findElement(By.xpath("//*[@id=\"login_btn\"]"))

.click(); // xpath语言:id为form-group-login的form下的button

// 防止页面未能及时加载出来而设置一段时间延迟

try {

Thread.sleep(15000);

} catch (InterruptedException e) {

e.printStackTrace();

}

// 获取cookie信息

cookies = driver.manage().getCookies();

driver.close();

}

controller

importorg.apache.http.client.methods.HttpGet;importorg.apache.http.impl.client.BasicCookieStore;importorg.apache.http.impl.client.CloseableHttpClient;importorg.apache.http.impl.client.HttpClients;importorg.springframework.beans.factory.annotation.Autowired;importorg.springframework.stereotype.Controller;import org.springframework.web.bind.annotation.*;importus.codecraft.webmagic.Spider;importorg.apache.http.cookie.Cookie;importjava.util.List;importorg.apache.http.client.CookieStore;/*** @auther mxh

* @time 2019/5/16 17:10*/@Controllerpublic classSJJYController {

@AutowiredprivateSJJYProcessor sjjyProcessor;

@AutowiredprivateSJJYPipeline sjjyPipeline;

@ResponseBody

@RequestMapping(value= "/start",method =RequestMethod.GET)publicString start(){//模拟浏览器自动登录

sjjyProcessor.login();for (int i=1;i<=9;i++){

Spider.create(sjjyProcessor)

.addUrl("http://www.jiayuan.com/usercp/dynmatch/ajax/jymatch_list.php?p="+i)

.addPipeline(sjjyPipeline)

.thread(5)

.run();

}return "success";

}

@ResponseBody

@RequestMapping(value= "/login",method =RequestMethod.GET)publicString login(){

String url="https://passport.jiayuan.com/dologin.php?pre_url=http://usercp.jiayuan.com/v2/";try{

sjjyProcessor.getCookieBySendPost(url);

}catch(Exception e) {

e.printStackTrace();

}return "login success";

}

@ResponseBody

@RequestMapping(value= "/test2",method =RequestMethod.GET)publicString test2(){/*// TODO Auto-generated method stub

String url="https://passport.jiayuan.com/dologin.php?pre_url=http://usercp.jiayuan.com/v2/";

//POST的URL

HttpPost httppost=new HttpPost(url);

//建立HttpPost对象

List params=new ArrayList();

//建立一个NameValuePair数组,用于存储欲传送的参数

params.add(new BasicNameValuePair("pwd","2544"));

HttpResponse response = null;

//添加参数

try {

httppost.setEntity(new UrlEncodedFormEntity(params, HTTP.UTF_8));

//设置编码

response = new DefaultHttpClient().execute(httppost);

} catch (UnsupportedEncodingException e) {

e.printStackTrace();

} catch (IOException e){

e.printStackTrace();

}

//发送Post,并返回一个HttpResponse对象

//Header header = response.getFirstHeader("Content-Length");

//String Length=header.getValue();

// 上面两行可以得到指定的Header

if(response.getStatusLine().getStatusCode()==200){//如果状态码为200,就是正常返回

String result= response.getEntity().getContent();

//得到返回的字符串

System.out.println(result);

}*/

//TODO Auto-generated method stub

CloseableHttpClient httpClient = null;//创建GET请求

HttpGet httpget = new HttpGet("https://passport.jiayuan.com/dologin.php?pre_url=http://usercp.jiayuan.com/v2/");

String result= null;try{

CookieStore cookieStore= newBasicCookieStore();

httpClient=HttpClients.custom().setDefaultCookieStore(cookieStore).build();

httpClient.execute(httpget);

String PHPSESSID= null;

List cookies =cookieStore.getCookies();

System.out.println(cookies);for (int i = 0; i < cookies.size(); i++) {if (cookies.get(i).getName().equals("PHPSESSID")) {

PHPSESSID=cookies.get(i).getValue();

System.out.println(PHPSESSID);

}

}

}catch(Exception ex) {

ex.printStackTrace();

}return "Hello World";

}

}

application.properties

server.port=8001

mybatis.type-aliases-package=com.example.shijijiayuan.demo

mybatis.mapper-locations=classpath*:mapper.xml

spring.datasource.url=jdbc:mysql://localhost:3306/******

spring.datasource.username=root

spring.datasource.password=root

spring.datasource.driver-class-name=com.mysql.jdbc.Driver

mapper.xml

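<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!-- 注意:namespace 必须是 SJJYMapper 接口的全限定名;此处的包名是根据 mybatis.type-aliases-package 推测的,请按实际工程修改 -->
<mapper namespace="com.example.shijijiayuan.demo.SJJYMapper">
    <insert id="addInfo">
        INSERT INTO info(name,image,info,mottos,reason) VALUES(#{name}, #{image}, #{info}, #{mottos}, #{reason})
    </insert>
</mapper>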

pom.xml

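<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.5.RELEASE</version>
    </parent>

    <groupId>com.example</groupId>
    <artifactId>shijijiayuan</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>shijijiayuan</name>
    <description>Demo project for Spring Boot</description>

    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>

        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>

        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>2.0.1</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.30</version>
        </dependency>

        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-api</artifactId>
            <version>3.14.0</version>
        </dependency>

        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-chrome-driver</artifactId>
            <version>3.14.0</version>
        </dependency>

        <!-- 注意:与上面的 selenium-api 声明重复,此处按原文保留 -->
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-api</artifactId>
            <version>3.14.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.8</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>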

记得下载相应的浏览器驱动,注意版本号要一致哦

博主这里用的是谷歌浏览器驱动

世纪佳缘网站登录要做验证码验证,博主暂时不会写那么智能的代码,所以只能手动选择了

代码中有些jar包可能导得不正确,注意哦,不要盲目copy

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值