先看一下爬虫的数据结构
说一下自己的需求,该找工作了,那么就要做好充足的准备,先把市面上能找到的工作都收集好,看看哪个适合自己吧.
踩了很多坑,首先webmagic框架的爬虫监控不是特别友好,如果想实时监管,需要改源码之类的,这种扩展太麻烦了,毕竟不是爬虫工程师…
其次,动态页面的数据,解密起来挺费劲,需要使用postman来查找api,想想还是太麻烦,我们还是先实现再优化吧
1.最好用maven的springboot来搞,因为它自带slf4j日志.我自己单独配了半天它的依赖还是看不了状态,放到springboot上一跑,果然啥都有了
package com.tianliangedu.spider;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.processor.PageProcessor;
public class fbossSpider_V2 implements PageProcessor {
static int i = 0;
private Site site = Site.me().setRetryTimes(1).setSleepTime(3000);
public Site getSite() {
return site;
}
public void process(Page page) {
page.addTargetRequests(page.getHtml()
.css("[class=page]")
.links().all());
if (page.getResultItems().get("name") == null) {
page.setSkip(true);
}
List<String> list = page.getHtml().css("[class=job-primary]").all();
String jsonStr2;
FileOutputStream fos = null;
OutputStreamWriter writer = null;
try {
fos=new FileOutputStream("G://石家庄javaweb_V1.txt", true);
writer = new OutputStreamWriter(fos, "UTF-8");
for (String string : list) {
jsonStr2 = string;
writer.append(jsonStr2);
}
} catch (Exception e) {
e.printStackTrace();
} finally {
try {
writer.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
try {
fos.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
page.addTargetRequests(page.getHtml()
.xpath("/html/body/div[1]/div[3]/div/div[2]/div[3]").links()
.all());
i++;
}
public static void main(String[] args) throws Exception {
// 创建爬虫实体类
fbossSpider_V2 selemium = new fbossSpider_V2();
// 设置selemium浏览器配置驱动
SeleniumDownloader seleniumDownloader = new SeleniumDownloader(
"G:\\爬虫\\drive\\chromedriver.exe");
seleniumDownloader.setSleepTime(5000);
// 配置当前浏览器配置
System.setProperty("selenuim_config",
"G:\\workspace\\Git\\webmagic\\config.ini");
// request类型配置,使用responsebody配置请求头,调用method方法选择post/get请求
Request request = new Request();
// 设置被爬取页面
String st = "https://www.zhipin.com/job_detail/?query=%E7%9F%B3%E5%AE%B6%E5%BA%84javaweb&city=100010000&industry=&position=";
// spider对象用于监控
Spider obj = Spider.create(selemium).addUrl(st)
.setDownloader(seleniumDownloader).thread(1);
// 注册监控
SpiderMonitor lister = SpiderMonitor.instance().register(obj);
obj.start();
}
}
依赖放在这
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.tianliangedu</groupId>
<artifactId>WebMagic_SpringBoot</artifactId>
<version>0.0.1-SNAPSHOT</version>
<!-- 属性配置 -->
<properties>
<!-- Encoding used for source files and generated reports -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<!-- Java language level of the project -->
<java.version>1.8</java.version>
<!-- Custom property holding the Java source level for the compiler plugin -->
<java_source_version>1.8</java_source_version>
</properties>
<!-- 首先配置仓库的服务器位置,首选阿里云,也可以配置镜像方式,效果雷同 -->
<repositories>
<!-- Repositories are addressed over HTTPS: Maven 3.8.1 and later block
     plain-HTTP external repositories by default. -->
<repository>
<id>nexus-aliyun</id>
<name>Nexus aliyun</name>
<url>https://maven.aliyun.com/nexus/content/groups/public</url>
</repository>
<repository>
<id>spring-milestone</id>
<url>https://repo.spring.io/libs-release</url>
</repository>
</repositories>
<!-- Spring boot 父引用 -->
<parent>
<!-- Spring Boot parent POM; presumably supplies the versions of the
     dependencies in this POM that are declared without a <version>. -->
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>1.4.0.RELEASE</version>
</parent>
<dependencies>
<!-- Spring Boot core web starter -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- Spring Boot web page template engine (Thymeleaf) -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-thymeleaf</artifactId>
</dependency>
<!-- MySQL JDBC driver -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
</dependency>
<!-- Spring Boot JdbcTemplate support -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-jdbc</artifactId>
</dependency>
<!-- fastjson -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.15</version>
</dependency>
<!-- Works around Thymeleaf's overly strict checking of HTML5 pages -->
<dependency>
<groupId>net.sourceforge.nekohtml</groupId>
<artifactId>nekohtml</artifactId>
<version>1.9.22</version>
</dependency>
<!-- Druid connection pool -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid</artifactId>
<version>1.1.6</version>
</dependency>
<!-- Hot restart / hot deployment during development -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-devtools</artifactId>
<optional>true</optional>
</dependency>
<!-- WebMagic + Selenium dependencies -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-selenium</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.0.1</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-chrome-driver</artifactId>
<version>3.0.1</version>
</dependency>
<!-- NOTE(review): 2.18.0 differs from the 3.0.1 Selenium artifacts above;
     confirm this artifact is needed and whether the versions should match. -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-server</artifactId>
<version>2.18.0</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<!-- webmagic-extension is declared exactly once (it used to be listed twice,
     which Maven reports as a duplicate declaration). The slf4j-log4j12 binding
     is excluded from it. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
<!-- 打成可执行的jar,并指定主启动类 -->
<build>
<plugins>
<!-- <plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<mainClass>com.tianliangedu.boot.ApplicationBootControler</mainClass>
</configuration>
</plugin> -->
<!-- Compiler settings. Declared once: the POM previously contained two
     maven-compiler-plugin declarations, one hard-coding 1.7 and one using
     properties that were never defined. Only properties defined in
     <properties> are referenced here. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.2</version>
<configuration>
<source>${java_source_version}</source>
<target>${java_source_version}</target>
<encoding>${project.build.sourceEncoding}</encoding>
<showDeprecation>true</showDeprecation>
<showWarnings>true</showWarnings>
</configuration>
</plugin>
<!-- <plugin> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-maven-plugin</artifactId>
<configuration> <mainClass>com.tl.job008.controller.root.SystemBootController</mainClass>
</configuration> </plugin> -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<!-- Fixes the encoding of resource files -->
<configuration>
<encoding>UTF-8</encoding>
</configuration>
<executions>
<execution>
<id>copy-resources</id>
<phase>validate</phase>
<goals>
<goal>copy-resources</goal>
</goals>
<configuration>
<!-- Copies the resource files into the build directory, matching the jar
     packaging below that leaves resources outside the jar. Another directory
     works too, e.g. ${project.build.directory}/conf -->
<outputDirectory>${project.build.directory}</outputDirectory>
<resources>
<resource>
<directory>src/main/resources</directory>
<filtering>true</filtering>
</resource>
</resources>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<classpathPrefix>lib/</classpathPrefix>
<!-- Entry point (class containing main) written to the manifest -->
<mainClass>com.tianliangedu.boot.ApplicationBootControler</mainClass>
<addDefaultImplementationEntries>true</addDefaultImplementationEntries>
<addDefaultSpecificationEntries>true</addDefaultSpecificationEntries>
</manifest>
<manifestEntries>
<!-- NOTE(review): ${Permissions} and ${Caller-Allowable-Codebase} are not
     defined in this POM; confirm they are supplied elsewhere. -->
<Permissions>${Permissions}</Permissions>
<Caller-Allowable-Codebase>${Caller-Allowable-Codebase}</Caller-Allowable-Codebase>
<Class-Path>./</Class-Path>
</manifestEntries>
</archive>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>copy-dependencies</id>
<phase>package</phase>
<configuration>
<overWriteReleases>false</overWriteReleases>
<overWriteSnapshots>false</overWriteSnapshots>
<overWriteIfNewer>true</overWriteIfNewer>
<!-- Copies the dependency jars into the lib directory -->
<outputDirectory>
${project.build.directory}/lib
</outputDirectory>
</configuration>
<goals>
<goal>copy-dependencies</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
boss单个词条貌似只能爬取10-14页内容,这个限制目前没研究,找工作是够用了
这点数据,明天搞到hive里来一遍,可视化一波也还行.
哦对,菜鸟一个,如果有大佬能告诉俺oos的问题就好了,Pipeline这个对象,甭管我咋搞,用自带的FilePipeline还是自定义的都不输出,自己装的mongodb也无用武之地…
附加好玩代码
boss直聘–selenium代码
package spider_Data;
import org.openqa.selenium.By;
import org.openqa.selenium.Keys;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
/**
 * Small Selenium demo: opens the BOSS Zhipin home page in Chrome, types a
 * search query into the search box and submits it.
 */
public class likePeople {

    /**
     * Runs the demo and closes the browser when it finishes or fails.
     *
     * @param args unused
     * @throws Exception if a Selenium call fails or a sleep is interrupted
     */
    public static void main(String[] args) throws Exception {
        // Constructed for its side effect only: presumably the constructor
        // registers the chromedriver path for ChromeDriver below - TODO confirm.
        SeleniumDownloader seleniumDownloader = new SeleniumDownloader("G:\\爬虫\\drive\\chromedriver.exe");
        // Location of the browser configuration file.
        System.setProperty("selenuim_config", "G:\\workspace\\Git\\webmagic\\config.ini");
        // Page to open.
        String st = "https://www.zhipin.com/";

        WebDriver driver = new ChromeDriver();
        try {
            Thread.sleep(500);
            driver.manage().window().maximize();
            Thread.sleep(500);
            driver.get(st);
            Thread.sleep(500);

            System.out.println("输入搜索内容");
            WebElement queryBox = driver.findElement(By.name("query"));
            queryBox.sendKeys("石家庄大数据");
            // ENTER submits the search form. The earlier follow-up step looked up
            // the form's <button> and passed it to switchTo().frame(...), which
            // fails because a button is not a frame, so it has been dropped.
            queryBox.sendKeys(Keys.ENTER);

            Thread.sleep(3000);
        } finally {
            // Always end the browser session so no Chrome/chromedriver process is left behind.
            driver.quit();
        }
    }
}
哦对,还有ini配置文件
# What WebDriver to use for the tests
#driver=phantomjs
#driver=firefox
driver=chrome
#driver=http://localhost:8910
#driver=http://localhost:4444/wd/hub
# PhantomJS specific config (change according to your installation)
#phantomjs_exec_path=/Users/Bingo/bin/phantomjs-qt5
#phantomjs_exec_path=d:/phantomjs.exe
# Forward slashes are used on purpose: if this file is read as Java properties,
# a backslash is an escape character and would be stripped from the path.
chrome_exec_path=C:/Users/sky/AppData/Local/Google/Chrome/Application/chrome.exe
#phantomjs_driver_path=/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/src/main.js
#phantomjs_driver_loglevel=DEBUG
chrome_driver_loglevel=DEBUG
这个文件随便扔个地方吧,总之项目中的相对路径,怎么写都找不到它这个龟儿子…tmd,写绝对路径保全家头发安康
driver自己下载匹配浏览器的版本就行