WebMagic框架教程 http://webmagic.io/docs/zh/
爬取世纪佳缘小姐姐信息
/**
 * Entity describing one scraped member profile.
 *
 * <p>Plain mutable bean with a no-arg constructor and getter/setter pairs so
 * that it can be used as a MyBatis parameter object.
 *
 * @author mxh
 * @since 2019/5/17
 */
public class Info {

    /** Record identifier; not assigned by either constructor. */
    private Integer id;

    /** Nickname. */
    private String name;

    /** Photo (the scraped image reference). */
    private String image;

    /** Basic information. */
    private String info;

    /** Love declaration. */
    private String mottos;

    /** Recommendation reason. */
    private String reason;

    /** Creates an empty profile; every field is {@code null}. */
    public Info() {
    }

    /**
     * Creates a profile with every field except {@code id} populated.
     *
     * @param name   nickname
     * @param image  photo reference
     * @param info   basic information
     * @param mottos love declaration
     * @param reason recommendation reason
     */
    public Info(String name, String image, String info, String mottos, String reason) {
        this.name = name;
        this.image = image;
        this.info = info;
        this.mottos = mottos;
        this.reason = reason;
    }

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getImage() {
        return image;
    }

    public void setImage(String image) {
        this.image = image;
    }

    public String getInfo() {
        return info;
    }

    public void setInfo(String info) {
        this.info = info;
    }

    public String getMottos() {
        return mottos;
    }

    public void setMottos(String mottos) {
        this.mottos = mottos;
    }

    public String getReason() {
        return reason;
    }

    public void setReason(String reason) {
        this.reason = reason;
    }

    /**
     * Renders all fields in the form
     * {@code Info{id=..., name='...', image='...', info='...', mottos='...', reason='...'}}.
     */
    @Override
    public String toString() {
        return "Info{"
                + "id=" + id
                + ", name='" + name + '\''
                + ", image='" + image + '\''
                + ", info='" + info + '\''
                + ", mottos='" + mottos + '\''
                + ", reason='" + reason + '\''
                + '}';
    }
}
dao层
import org.springframework.stereotype.Repository;

/**
 * Data-access interface for storing scraped {@link Info} records.
 *
 * <p>No implementation is written by hand; the {@code addInfo} statement with
 * the same id is declared in {@code mapper.xml} under this interface's
 * namespace.
 *
 * <p>NOTE(review): only {@code @Repository} is present here. Whether the
 * interface is registered as a MyBatis mapper depends on a {@code @Mapper} /
 * {@code @MapperScan} declaration that is not visible in this file; confirm.
 *
 * @author mxh
 * @since 2019/5/17
 */
@Repository
public interface SJJYMapper {

    /**
     * Stores one profile.
     *
     * @param info the profile to store
     * @return the integer result of the insert statement (presumably the
     *         number of affected rows; confirm against the MyBatis setup)
     */
    int addInfo(Info info);
}
爬虫框架持久层
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; /** * @auther mxh * @time 2019/5/17 13:59 * * 爬虫框架dao层 */ @Service public class SJJYPipeline implements Pipeline { @Autowired private SJJYMapper sjjyMapper; @Override public void process(ResultItems resultItems, Task task) { System.out.println("get page: " + resultItems.getRequest().getUrl()); String[] names = resultItems.get("names").toString().split(","); String[] images = resultItems.get("images").toString().split(","); String[] infos = resultItems.get("infos").toString().split(","); String[] mottoes = resultItems.get("mottoes").toString().split(","); String[] reasons = resultItems.get("reasons").toString().split(","); for (int i=0;i<names.length;i++){ Info info = new Info(names[i],images[i],infos[i],mottoes[i],reasons[i]); sjjyMapper.addInfo(info); System.out.println("add info: " + info.toString()); } } }
爬虫框架数据筛选逻辑层
import org.apache.http.Header; import org.apache.http.HttpResponse; import org.apache.http.NameValuePair; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.client.protocol.HttpClientContext; import org.apache.http.client.utils.DateUtils; import org.apache.http.client.utils.URIBuilder; import org.apache.http.config.Registry; import org.apache.http.config.RegistryBuilder; import org.apache.http.cookie.CookieOrigin; import org.apache.http.cookie.CookieSpecProvider; import org.apache.http.cookie.MalformedCookieException; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.cookie.DefaultCookieSpec; import org.apache.http.message.BasicHeader; import org.openqa.selenium.By; import org.openqa.selenium.Cookie; import org.openqa.selenium.WebDriver; import org.openqa.selenium.chrome.ChromeDriver; import org.springframework.stereotype.Service; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Html; import java.io.*; import java.net.HttpURLConnection; import java.net.URISyntaxException; import java.net.URL; import java.net.URLEncoder; import java.util.*; /** * @auther mxh * @time 2019/5/16 17:01 * * 爬虫框架数据筛选逻辑层 */ @Service public class SJJYProcessor implements PageProcessor { private Site site = Site.me().setCharset("utf8").setRetryTimes(1000).setSleepTime(1000); // 用来存储cookie信息 private Set<Cookie> cookies; @Override public void process(Page page) { Html html = page.getHtml(); //照片 List<String> images = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanPic\"]/a/img/@_src").all(); 
//姓名 List<String> names = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanInfo\"]/div[@class=\"jysp_huiyuanName\"]/a/text()").all(); //基本信息 List<String> infos = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanInfo\"]/div[@class=\"jysp_huiyuanArea\"]/text()").all(); //爱情宣言 List<String> mottos = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanInfo\"]/div[@class=\"jysp_huiyuanText\"]/text()").all(); //推荐理由 List<String> reasons = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanInfo\"]/div[@class=\"jysp_huiyuanLy\"]/text()").all(); /*输出到控制台 并使dao层接收到数据*/ page.putField("names",names); page.putField("images",images); page.putField("infos",infos); page.putField("mottoes",mottos); page.putField("reasons",reasons); } @Override public Site getSite() { //设置主机地址 site.setDomain("www.jiayuan.com"); //手动设置cookie //site.addCookie("PHPSESSID","f16de947c3a48a1084d22dd7e72cd283"); /*site.addCookie("PHPSESSID","8b392aacbf80a4d6cf102938271273a7"); site.addCookie("COMMON_HASH","0d8c3daa82c80277292723d74ff197d0"); site.addCookie("PROFILE","207838031%3A%25E5%25BD%25BC%25E5%25BE%2597%25E5%25B8%2595%25E5%2585%258B%3Am%3Aimages1.jyimg.com%2Fw4%2Fglobal%2Fi%3A0%3A%3A1%3Azwzp_m.jpg%3A1%3A1%3A50%3A10%3A3.0"); site.addCookie("RAW_HASH","fYGR2xG5XJL10gfFF4mP3qO0yN65wBrTZpeOrelDWKHerbx69EjQ138l9BfHlTYP%2AGuyrs-5xYCSsUMipqBNkKqExN%2AWVe7sWAWAa5w8VXf-TMA."); site.addCookie("SESSION_HASH","c2dbd047d891295d1b3e4d5b4cb687e71eeb1afd"); site.addCookie("accessID","20190516163650639629"); site.addCookie("ip_loc","31"); site.addCookie("save_jy_login_name","15735400536"); site.addCookie("stadate1","206838031"); site.addCookie("user_access","1"); site.addCookie("main_search:207838031","%7C%7C%7C00"); site.addCookie("last_login_time","1558057676");*/ //自动追加 for (org.apache.http.cookie.Cookie cookie : cookies) {
site.addCookie(cookie.getName().toString(), cookie.getValue().toString());
}
return site;
}
// 自动登陆方法
public void login() {
//注册chrome
System.setProperty("webdriver.chrome.driver", "D:\\chromedriver.exe");
WebDriver driver = new ChromeDriver();
driver.get("http://login.jiayuan.com/?refrer=http://www.jiayuan.com&host=0");// 打开网址
// 防止页面未能及时加载出来而设置一段时间延迟
try {
Thread.sleep(2000);
} catch (InterruptedException e) {
e.printStackTrace();
}
// 设置用户名密码
driver.findElement(By.id("login_email")).sendKeys("15735400536"); // 用户名
driver.findElement(By.id("login_password")).sendKeys("mxh970923"); // 密码
// 模拟点击 //form[@id='form-group-login']/button
driver.findElement(By.xpath("//*[@id=\"login_btn\"]"))
.click(); // xpath语言:id为form-group-login的form下的button
// 防止页面未能及时加载出来而设置一段时间延迟
try {
Thread.sleep(15000);
} catch (InterruptedException e) {
e.printStackTrace();
}
// 获取cookie信息
cookies = driver.manage().getCookies();
driver.close();
}
controller
import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Controller; import org.springframework.web.bind.annotation.*; import us.codecraft.webmagic.Spider; import org.apache.http.cookie.Cookie; import java.util.List; import org.apache.http.client.CookieStore; /** * @auther mxh * @time 2019/5/16 17:10 */ @Controller public class SJJYController { @Autowired private SJJYProcessor sjjyProcessor; @Autowired private SJJYPipeline sjjyPipeline; @ResponseBody @RequestMapping(value = "/start",method = RequestMethod.GET) public String start(){ //模拟浏览器自动登录 sjjyProcessor.login(); for (int i=1;i<=9;i++){ Spider.create(sjjyProcessor) .addUrl("http://www.jiayuan.com/usercp/dynmatch/ajax/jymatch_list.php?p="+i) .addPipeline(sjjyPipeline) .thread(5) .run(); } return "success"; } @ResponseBody @RequestMapping(value = "/login",method = RequestMethod.GET) public String login(){ String url ="https://passport.jiayuan.com/dologin.php?pre_url=http://usercp.jiayuan.com/v2/"; try { sjjyProcessor.getCookieBySendPost(url); } catch (Exception e) { e.printStackTrace(); } return "login success"; } @ResponseBody @RequestMapping(value = "/test2",method = RequestMethod.GET) public String test2(){ /*// TODO Auto-generated method stub String url="https://passport.jiayuan.com/dologin.php?pre_url=http://usercp.jiayuan.com/v2/"; //POST的URL HttpPost httppost=new HttpPost(url); //建立HttpPost对象 List<NameValuePair> params=new ArrayList<NameValuePair>(); //建立一个NameValuePair数组,用于存储欲传送的参数 params.add(new BasicNameValuePair("pwd","2544")); HttpResponse response = null; //添加参数 try { httppost.setEntity(new UrlEncodedFormEntity(params, HTTP.UTF_8)); //设置编码 response = new DefaultHttpClient().execute(httppost); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } 
catch (IOException e){ e.printStackTrace(); } //发送Post,并返回一个HttpResponse对象 //Header header = response.getFirstHeader("Content-Length"); //String Length=header.getValue(); // 上面两行可以得到指定的Header if(response.getStatusLine().getStatusCode()==200){//如果状态码为200,就是正常返回 String result= response.getEntity().getContent(); //得到返回的字符串 System.out.println(result); }*/ // TODO Auto-generated method stub CloseableHttpClient httpClient = null; //创建GET请求 HttpGet httpget = new HttpGet("https://passport.jiayuan.com/dologin.php?pre_url=http://usercp.jiayuan.com/v2/"); String result = null; try { CookieStore cookieStore = new BasicCookieStore(); httpClient = HttpClients.custom().setDefaultCookieStore(cookieStore).build(); httpClient.execute(httpget); String PHPSESSID = null; List<Cookie> cookies = cookieStore.getCookies(); System.out.println(cookies); for (int i = 0; i < cookies.size(); i++) { if (cookies.get(i).getName().equals("PHPSESSID")) { PHPSESSID = cookies.get(i).getValue(); System.out.println(PHPSESSID); } } } catch (Exception ex) { ex.printStackTrace(); } return "Hello World"; } }
application.properties
server.port=8001
mybatis.type-aliases-package=com.example.shijijiayuan.demo
mybatis.mapper-locations=classpath*:mapper.xml
spring.datasource.url=jdbc:mysql://localhost:3306/******
spring.datasource.username=root
spring.datasource.password=root
spring.datasource.driver-class-name=com.mysql.jdbc.Driver
mapper.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >
<mapper namespace="com.example.shijijiayuan.demo.SJJYMapper" >
<!-- Statement for SJJYMapper.addInfo: inserts one row into the info table,
     filling each listed column from the same-named property of the Info
     parameter. The id column is not supplied here (presumably generated by
     the database; confirm against the table definition). -->
<insert id="addInfo" parameterType="com.example.shijijiayuan.demo.Info">
INSERT INTO info(name,image,info,mottos,reason) VALUES(#{name}, #{image}, #{info}, #{mottos}, #{reason})
</insert>
</mapper>
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.5.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.example</groupId>
    <artifactId>shijijiayuan</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>shijijiayuan</name>
    <description>Demo project for Spring Boot</description>

    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>

        <!-- WebMagic crawler framework -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>

        <!-- MyBatis and the MySQL driver -->
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>2.0.1</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.30</version>
        </dependency>

        <!-- Selenium, used for the browser login; selenium-api is declared once
             only, because a repeated declaration triggers a Maven duplicate
             dependency warning -->
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-api</artifactId>
            <version>3.14.0</version>
        </dependency>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-chrome-driver</artifactId>
            <version>3.14.0</version>
        </dependency>

        <!-- Apache HttpClient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.8</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
记得下载相应的浏览器驱动,注意驱动的版本号要与本机浏览器的版本一致哦
博主这里用的是谷歌浏览器驱动
世纪佳缘网站登录要做验证码验证,博主暂时不会写那么智能的代码,所以只能手动选择了
代码中有些类的import可能不正确(例如Selenium和HttpClient里都有名为Cookie的类,容易导错),请注意核对,不要盲目copy