简介:之前玩过 Python 爬虫,但是不太想再学习 Python 了,爬虫原理这方面就不再概述了
HttpClient
入门程序
- 创建工程,引入依赖
<dependencies>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.6.6</version>
</dependency>
</dependencies>
- resource 下日志文件 log4j.properties
log4j.rootLogger=DEBUG,A1
log4j.logger.com.zhj = DEBUG
log4j.appender.A1 = org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout = org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss} [%t] [%c]-[%p] %m%n
- 程序
/**
 * Minimal HttpClient example: fetch a page and print its body.
 */
public class CrawlerFirst {
    public static void main(String[] args) throws IOException {
        // try-with-resources closes both the client and the response even on
        // failure (the original leaked both).
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            // The request: a plain GET for the target page.
            HttpGet httpGet = new HttpGet("http://www.itcast.cn");
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                // Only consume and print the body on HTTP 200.
                StatusLine statusLine = response.getStatusLine();
                if (statusLine.getStatusCode() == 200) {
                    HttpEntity entity = response.getEntity();
                    String content = EntityUtils.toString(entity, "utf8");
                    System.out.println(content);
                }
            }
        }
    }
}
HttpGet
/**
 * GET example with explicit try/finally resource cleanup.
 */
public class HttpGetTest {
    public static void main(String[] args) {
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet("http://www.baidu.com");
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // response stays null if execute() threw - guard before closing,
            // otherwise the finally block itself throws NPE.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
带参数Get
/**
 * GET with query parameters built via URIBuilder.
 */
public class CrawlerFirst {
    public static void main(String[] args) throws Exception {
        // Build the query string safely: URIBuilder handles URL-encoding.
        URIBuilder uriBuilder = new URIBuilder("http://www.baidu.com/s");
        uriBuilder.setParameter("wd", "许龄月");
        uriBuilder.setParameter("ie", "UTF-8");
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        // try-with-resources closes both client and response (the original
        // leaked both).
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Only consume and print the body on HTTP 200.
            StatusLine statusLine = response.getStatusLine();
            if (statusLine.getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                System.out.println(EntityUtils.toString(entity, "utf8"));
            }
        }
    }
}
HttpPost
HttpPost httpPost = new HttpPost("http://www.baidu.com");
HttpPost 带参数
/**
 * POST with URL-encoded form parameters.
 */
public class HttpPostTest {
    public static void main(String[] args) throws Exception {
        HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");
        // Encode the form parameters into the POST body.
        List<NameValuePair> list = new ArrayList<NameValuePair>();
        list.add(new BasicNameValuePair("keys", "java"));
        httpPost.setEntity(new UrlEncodedFormEntity(list));
        // try-with-resources closes both client and response (the original
        // leaked both).
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content);
            }
        }
    }
}
连接池
/**
 * Demonstrates sharing one connection pool across multiple requests.
 */
public class HttpPostPoolTest {
    public static void main(String[] args) throws Exception {
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        cm.setMaxTotal(100);          // max connections in the whole pool
        cm.setDefaultMaxPerRoute(10); // max connections per host
        doGet(cm);
        doGet(cm);
    }

    /** Issue one pooled POST. The client is NOT closed - the pool owns the connections. */
    private static void doGet(PoolingHttpClientConnectionManager cm) throws Exception {
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        System.out.println(httpClient);
        HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");
        // Encode the form parameters into the POST body.
        List<NameValuePair> list = new ArrayList<NameValuePair>();
        list.add(new BasicNameValuePair("keys", "java"));
        httpPost.setEntity(new UrlEncodedFormEntity(list));
        // Close the response so its connection is returned to the pool
        // (the original never closed it), but never close httpClient here:
        // connection lifetime is managed by the shared pool.
        try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        }
    }
}
请求参数
// 请求信息
// Request configuration. BUG FIX: the first call was setConnectionRequestTimeout(1000),
// duplicating the second call (which then overwrote it) and leaving the connect
// timeout unset; the comment shows setConnectTimeout was intended (and matches
// HttpUtils.getConfig elsewhere in these notes).
RequestConfig build = RequestConfig.custom()
        .setConnectTimeout(1000)           // max time to establish the TCP connection (ms)
        .setConnectionRequestTimeout(500)  // max time to lease a connection from the pool (ms)
        .setSocketTimeout(10 * 1000)       // max idle time between data packets (ms)
        .build();
httpPost.setConfig(build);
JSoup
java的html解析工具
新增依赖
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.6</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.11</version>
</dependency>
- 解析dom
- 解析String
- 解析文件
/**
 * Jsoup parsing demos: build a Document from a URL, a String, or a File,
 * then print the page title.
 */
public class JsoupParse {

    /** Parse a document fetched from a URL, with a 1000 ms timeout. */
    @Test
    public void testUrl () throws Exception {
        Document doc = Jsoup.parse(new URL("http://www.itcast.cn"), 1000);
        printTitle(doc);
    }

    /** Parse a document from an in-memory HTML string read off disk. */
    @Test
    public void testString () throws Exception {
        String html = FileUtils.readFileToString(
                new File("/Users/mac/IdeaProjects/spring-demos/czbk.html"), "utf8");
        printTitle(Jsoup.parse(html));
    }

    /** Parse a document directly from a file on disk. */
    @Test
    public void testFile () throws Exception {
        Document doc = Jsoup.parse(
                new File("/Users/mac/IdeaProjects/spring-demos/czbk.html"), "utf8");
        printTitle(doc);
    }

    /** Print the text of the document's first title element. */
    private void printTitle(Document doc) {
        System.out.println(doc.getElementsByTag("title").first().text());
    }
}
通过标签获取
通过Css获取
document.select()
常用方法:
tagname:通过标签查找元素,比如:span
#id:通过ID查找元素,比如:#city_bj
.class: 通过class名称查找元素,比如 : .class_a
[attribute]:利用属性查找元素,比如:[abc]
[attr=value]:利用属性值查找元素,比如:[class=s_name]
组合标签的使用
el#id: 元素+id,比如h3#city_bj
el.class:元素+class
el[attr]:元素+属性名
任意组合:比如:span[abc].s_name
ancestor child:查找某个元素下的子元素
parent > child: 查找某个父元素下的直接子元素
.city_con > ul > li: 先查找city_con下第一级的ul,再查找这些ul下第一级的li
parent > *:查找某个父元素下的所有直接子元素
案例工具类
HttpUtils
@Component
@Component
public class HttpUtils {

    // Shared connection pool for all requests issued by this helper.
    private PoolingHttpClientConnectionManager cm;

    public HttpUtils(){
        this.cm = new PoolingHttpClientConnectionManager();
        this.cm.setMaxTotal(100);          // max connections in the whole pool
        this.cm.setDefaultMaxPerRoute(10); // max connections per host
    }

    /**
     * Download the page at {@code url} and return its body as a string.
     *
     * @param url page address
     * @return response body decoded as UTF-8, or "" on non-200 status or I/O error
     */
    public String doGetHtml(String url){
        // Do NOT close the client: its connections belong to the shared pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                return EntityUtils.toString(entity, "utf8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // response is null when execute() itself failed - guard the close.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return "";
    }

    /**
     * Download the image at {@code url} into /Users/mac/Desktop/img under a
     * random UUID file name, preserving the original extension.
     *
     * @param url image address (must contain a "." extension)
     * @return the generated file name, or "" on failure
     */
    public String doGetImage(String url){
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                if (!Objects.isNull(entity)) {
                    String extName = url.substring(url.lastIndexOf("."));
                    String picName = UUID.randomUUID().toString() + extName;
                    // File(dir, name) supplies the path separator the original
                    // string concatenation was missing ("imgxxx.jpg" on the
                    // Desktop), and try-with-resources closes the stream the
                    // original leaked.
                    try (FileOutputStream out =
                                 new FileOutputStream(new File("/Users/mac/Desktop/img", picName))) {
                        entity.writeTo(out);
                    }
                    return picName;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Guard against a null response before closing (execute() may throw).
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return "";
    }

    /** Shared request timeouts: connect 1s, pool lease 0.5s, socket 10s. */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)
                .setConnectionRequestTimeout(500)
                .setSocketTimeout(10 * 1000)
                .build();
    }
}
定时任务的使用
- 启动类上 加 @EnableScheduling
- 定时任务类 加上 @Component
- 定时方法
/**
 * Scheduled crawler task: every 10 seconds fetch pages 1-9 of the book
 * listing and print the book names found on each page.
 */
@Component
public class ItemTask {

    @Autowired
    HttpUtils httpUtils;

    // Runs again 10 seconds after the previous run finishes.
    @Scheduled(fixedDelay = 10*1000)
    public void itemTask() {
        // String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&pvid=6ee395300307469ab7f60f1c5a2a7365&s=116&click=0&page";
        for (int page = 1; page < 10; page++) {
            // BUG FIX: the page number must go *inside* the URL, before ".html".
            // The old code built the URL once with page=1 and then appended the
            // loop index after it, requesting "..._1.html2", "..._1.html3", ...
            String url = "http://www.shuhai.com/shuku/0_113_0_0_0_2_0_" + page + ".html";
            String html = httpUtils.doGetHtml(url);
            this.parse(html);
        }
    }

    /** Extract and print each book entry's name from one listing page. */
    private void parse(String html) {
        Document document = Jsoup.parse(html);
        System.out.println(html);
        Elements elements = document.select("body > div.content > div > div:nth-child(1) > div.c8.shadow.bgfff > div.book-list-wrapper > div");
        for (int i = 0; i < elements.size(); i++) {
            Elements select = elements.get(i).select("div > div.flex > div.book-name > a");
            // BUG FIX: print the name link just selected; the old code printed
            // the entire element list (elements.text()) on every iteration.
            System.out.println(select.text());
        }
    }
    // //表示方法执行完成后5秒 (runs 5s after the previous run finishes)
    // @Scheduled(fixedDelay = 5000)
    // public void fixedDelayJob() throws InterruptedException {
    //     System.out.println("fixedDelay 每隔5秒" + new Date());
    // }
    //
    // //表示每隔3秒 (fixed 3s rate, regardless of run duration)
    // @Scheduled(fixedRate = 3000)
    // public void fixedRateJob() {
    //     System.out.println("fixedRate 每隔3秒" + new Date());
    // }
    //
    // //表示每天8时30分0秒执行 (cron expression example)
    // @Scheduled(cron = "0 0,30 0,8 ? * ? ")
    // public void cronJob() {
    //     System.out.println(new Date() + " ...>>cron....");
    // }
}
解析之后正常保存到数据库即可