boss爬虫连淦6小时-webmagic+selenium实现

先看一下爬虫的数据结构
在这里插入图片描述
在这里插入图片描述

说一下自己的需求:该找工作了,那么就要做好充足的准备,先把市面上能找到的工作都收集好,看看哪个适合自己吧。

踩了很多坑,首先webmagic框架的爬虫监控不是特别友好,如果想实时监管,需要改源码之类的,这种扩展太麻烦了,毕竟不是爬虫工程师…
其次,动态页面的数据,解密起来挺费劲,需要使用postman来查找api,想想还是太麻烦,我们还是先实现再优化吧

1.最好用maven的springboot来搞,因为它自带slf4j,是这么叫吗?我自己单独配了半天它的依赖还是看不了状态,放到springboot上一跑,果然啥都有了
在这里插入图片描述

package com.tianliangedu.spider;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.processor.PageProcessor;

public class fbossSpider_V2 implements PageProcessor {
	static int i = 0;

	private Site site = Site.me().setRetryTimes(1).setSleepTime(3000);

	public Site getSite() {
		return site;
	}

	public void process(Page page) {
		page.addTargetRequests(page.getHtml()
				.css("[class=page]")
				.links().all());
		if (page.getResultItems().get("name") == null) {
			page.setSkip(true);
		}


		List<String> list = page.getHtml().css("[class=job-primary]").all();
		String jsonStr2;
		FileOutputStream fos = null;
		 OutputStreamWriter writer = null;
		try {
					 fos=new FileOutputStream("G://石家庄javaweb_V1.txt", true);
					     writer = new OutputStreamWriter(fos, "UTF-8");
					for (String string : list) {
				jsonStr2 = string;
				   writer.append(jsonStr2);
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			 try {
				writer.close();
			} catch (IOException e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
			}
			    try {
					fos.close();
				} catch (IOException e) {
					// TODO Auto-generated catch block
					e.printStackTrace();
				}
		}
		
		
		page.addTargetRequests(page.getHtml()
				.xpath("/html/body/div[1]/div[3]/div/div[2]/div[3]").links()
				.all());
		i++;
	}

	public static void main(String[] args) throws Exception {
		// 创建爬虫实体类
		fbossSpider_V2 selemium = new fbossSpider_V2();
		// 设置selemium浏览器配置驱动
		SeleniumDownloader seleniumDownloader = new SeleniumDownloader(
				"G:\\爬虫\\drive\\chromedriver.exe");
		seleniumDownloader.setSleepTime(5000);
		// 配置当前浏览器配置
		System.setProperty("selenuim_config",
				"G:\\workspace\\Git\\webmagic\\config.ini");
		// request类型配置,使用responsebody配置请求头,调用method方法选择post/get请求
		Request request = new Request();
		// 设置被爬取页面
		String st = "https://www.zhipin.com/job_detail/?query=%E7%9F%B3%E5%AE%B6%E5%BA%84javaweb&city=100010000&industry=&position=";
		// spider对象用于监控
		Spider obj = Spider.create(selemium).addUrl(st)
				.setDownloader(seleniumDownloader).thread(1);
		// 注册监控
		SpiderMonitor lister = SpiderMonitor.instance().register(obj);
		obj.start();
	}

}

依赖放在这

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<groupId>com.tianliangedu</groupId>
	<artifactId>WebMagic_SpringBoot</artifactId>
	<version>0.0.1-SNAPSHOT</version>

	<!-- Build properties -->
	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
		<java.version>1.8</java.version>
		<java_source_version>1.8</java_source_version>
		<!-- Companions to java_source_version: compiler target level and source
		     encoding for plugin configurations that use these custom names -->
		<java_target_version>1.8</java_target_version>
		<file_encoding>UTF-8</file_encoding>
	</properties>

	<!-- Remote repositories: the Aliyun mirror is preferred (a mirror entry in
	     settings.xml would work equally well). HTTPS is used because Maven 3.8.1+
	     blocks plain-HTTP repositories by default. -->
	<repositories>
		<repository>
			<id>nexus-aliyun</id>
			<name>Nexus aliyun</name>
			<url>https://maven.aliyun.com/nexus/content/groups/public</url>
		</repository>
		<repository>
			<id>spring-milestone</id>
			<url>https://repo.spring.io/libs-release</url>
		</repository>
	</repositories>

	<!-- Spring Boot parent POM -->
	<parent>
		<groupId>org.springframework.boot</groupId>
		<artifactId>spring-boot-starter-parent</artifactId>
		<version>1.4.0.RELEASE</version>
	</parent>

	<dependencies>
		<!-- Spring Boot core web starter -->
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-web</artifactId>
		</dependency>

		<!-- Spring Boot web page template engine -->
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-thymeleaf</artifactId>
		</dependency>

		<!-- MySQL JDBC driver -->
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
		</dependency>
		<!-- Spring Boot JdbcTemplate starter -->
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-jdbc</artifactId>
		</dependency>
		<!-- fastjson -->
		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>fastjson</artifactId>
			<version>1.2.15</version>
		</dependency>

		<!-- Relaxes Thymeleaf's overly strict checking of HTML5 pages -->
		<dependency>
			<groupId>net.sourceforge.nekohtml</groupId>
			<artifactId>nekohtml</artifactId>
			<version>1.9.22</version>
		</dependency>

		<!-- Druid connection pool -->
		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>druid</artifactId>
			<version>1.1.6</version>
		</dependency>

		<!-- Hot restart / hot deployment during development -->
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-devtools</artifactId>
			<optional>true</optional>
		</dependency>

		<!-- WebMagic + Selenium -->
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-core</artifactId>
			<version>0.7.3</version>
		</dependency>
		<!-- Declared exactly once; Maven requires each groupId:artifactId pair to be
		     unique within a dependencies section. The log4j binding is excluded. -->
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-extension</artifactId>
			<version>0.7.3</version>
			<exclusions>
				<exclusion>
					<groupId>org.slf4j</groupId>
					<artifactId>slf4j-log4j12</artifactId>
				</exclusion>
			</exclusions>
		</dependency>
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-selenium</artifactId>
			<version>0.7.3</version>
		</dependency>
		<dependency>
			<groupId>org.seleniumhq.selenium</groupId>
			<artifactId>selenium-java</artifactId>
			<version>3.0.1</version>
		</dependency>
		<dependency>
			<groupId>org.seleniumhq.selenium</groupId>
			<artifactId>selenium-chrome-driver</artifactId>
			<version>3.0.1</version>
		</dependency>
		<!-- NOTE(review): 2.18.0 is a different major line from the 3.0.1 Selenium
		     artifacts above; confirm this dependency is needed and compatible -->
		<dependency>
			<groupId>org.seleniumhq.selenium</groupId>
			<artifactId>selenium-server</artifactId>
			<version>2.18.0</version>
		</dependency>
	</dependencies>

	<!-- Package an executable jar and declare the main class -->
	<build>
		<plugins>
			<!-- <plugin>
				<groupId>org.springframework.boot</groupId>
				<artifactId>spring-boot-maven-plugin</artifactId>
				<configuration>
					<mainClass>com.tianliangedu.boot.ApplicationBootControler</mainClass>
				</configuration>
			</plugin> -->

			<!-- Single compiler configuration. Source/target level and encoding come
			     from properties defined in this POM, so every placeholder resolves. -->
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-compiler-plugin</artifactId>
				<version>2.3.2</version>
				<configuration>
					<source>${java.version}</source>
					<target>${java.version}</target>
					<encoding>${project.build.sourceEncoding}</encoding>
					<showDeprecation>true</showDeprecation>
					<showWarnings>true</showWarnings>
				</configuration>
			</plugin>
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-resources-plugin</artifactId>
				<!-- Fixes the encoding of resource files -->
				<configuration>
					<encoding>UTF-8</encoding>
				</configuration>
				<executions>
					<execution>
						<id>copy-resources</id>
						<phase>validate</phase>
						<goals>
							<goal>copy-resources</goal>
						</goals>
						<configuration>
							<!-- Copies the resource files into the build directory; another
							     directory such as ${project.build.directory}/conf works too -->
							<outputDirectory>${project.build.directory}</outputDirectory>
							<resources>
								<resource>
									<directory>src/main/resources</directory>
									<filtering>true</filtering>
								</resource>
							</resources>
						</configuration>
					</execution>
				</executions>
			</plugin>
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-jar-plugin</artifactId>
				<configuration>
					<archive>
						<manifest>
							<addClasspath>true</addClasspath>
							<classpathPrefix>lib/</classpathPrefix>
							<!-- Entry point (class containing main) -->
							<mainClass>com.tianliangedu.boot.ApplicationBootControler</mainClass>
							<addDefaultImplementationEntries>true</addDefaultImplementationEntries>
							<addDefaultSpecificationEntries>true</addDefaultSpecificationEntries>
						</manifest>
						<manifestEntries>
							<!-- NOTE(review): Permissions and Caller-Allowable-Codebase are not
							     defined as properties in this POM; confirm where they come from -->
							<Permissions>${Permissions}</Permissions>
							<Caller-Allowable-Codebase>${Caller-Allowable-Codebase}</Caller-Allowable-Codebase>
							<Class-Path>./</Class-Path>
						</manifestEntries>
					</archive>
				</configuration>
			</plugin>
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-dependency-plugin</artifactId>
				<executions>
					<execution>
						<id>copy-dependencies</id>
						<phase>package</phase>
						<configuration>
							<overWriteReleases>false</overWriteReleases>
							<overWriteSnapshots>false</overWriteSnapshots>
							<overWriteIfNewer>true</overWriteIfNewer>
							<!-- Copies the dependency jars into the lib directory -->
							<outputDirectory>${project.build.directory}/lib</outputDirectory>
						</configuration>
						<goals>
							<goal>copy-dependencies</goal>
						</goals>
					</execution>
				</executions>
			</plugin>
		</plugins>
	</build>

</project>

boss单个词条貌似只能爬取10-14页内容,这个限制目前没研究,找工作是够用了
在这里插入图片描述
这点数据,明天搞到hive里来一遍,可视化一波也还行.
哦对,菜鸟一个,如果有大佬能告诉俺oos的问题就好了,Pipeline这个对象,甭管我咋搞,用自带的FilePipeline还是自定义的都不输出,自己装的mongodb也无用武之处…

附加好玩代码
boss直聘–selenium代码

package spider_Data;

import org.openqa.selenium.By;
import org.openqa.selenium.Keys;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;

import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;

/**
 * Small Selenium demo: opens zhipin.com in Chrome, types a query into the
 * search box and submits it with the Enter key.
 */
public class likePeople {
	/**
	 * Runs the demo and always shuts the browser down afterwards.
	 *
	 * @param args unused
	 * @throws Exception if the thread is interrupted while sleeping or a browser step fails
	 */
	public static void main(String[] args) throws Exception {
		// Tell Selenium where the chromedriver binary lives.
		System.setProperty("webdriver.chrome.driver", "G:\\爬虫\\drive\\chromedriver.exe");
		// Page to open.
		String st = "https://www.zhipin.com/";

		WebDriver driver = new ChromeDriver();
		try {
			Thread.sleep(500);
			driver.manage().window().maximize();
			Thread.sleep(500);
			driver.get(st);
			Thread.sleep(500);
			System.out.println("输入搜索内容");

			// Type the query and submit with Enter. The earlier attempt to
			// switchTo().frame(...) on the search button is gone: a button is not
			// a frame, so that call could only fail.
			WebElement searchBox = driver.findElement(By.name("query"));
			searchBox.sendKeys("石家庄大数据");
			searchBox.sendKeys(Keys.ENTER);

			Thread.sleep(3000);
		} finally {
			// Close the browser and end the chromedriver process even on failure.
			driver.quit();
		}
	}
}

哦对,还有ini配置文件在这里插入图片描述

# What WebDriver to use for the tests
#driver=phantomjs
#driver=firefox
driver=chrome
#driver=http://localhost:8910
#driver=http://localhost:4444/wd/hub

# PhantomJS specific config (change according to your installation)
#phantomjs_exec_path=/Users/Bingo/bin/phantomjs-qt5
#phantomjs_exec_path=d:/phantomjs.exe
# NOTE(review): if this file is loaded with java.util.Properties, a single
# backslash is an escape character and is dropped from the value; forward
# slashes or doubled backslashes would be needed. Confirm how the loader parses it.
chrome_exec_path=C:\Users\sky\AppData\Local\Google\Chrome\Application\chrome.exe
#phantomjs_driver_path=/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/src/main.js
#phantomjs_driver_loglevel=DEBUG
chrome_driver_loglevel=DEBUG

这个文件随便扔个地方吧,总之项目中的相对路径,怎么写都找不到它这个龟儿子…tmd,写绝对路径保全家头发安康
driver自己下载匹配浏览器的版本就行

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值