Scheduled data crawling with Java: fetching the Baidu Fengyunbang (百度风云榜) rankings on a timer

Maven

<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-jdbc</artifactId>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>cn.wanghaomiao</groupId>
        <artifactId>JsoupXpath</artifactId>
        <version>2.3.2</version>
    </dependency>
    <dependency>
        <groupId>cn.hutool</groupId>
        <artifactId>hutool-all</artifactId>
        <version>4.5.1</version>
    </dependency>
</dependencies>
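
A quick sanity check that JsoupXpath resolves correctly is useful, since the artifact lives under cn.wanghaomiao but its classes sit in the org.seimicrawler.xpath package. The sketch below (class name and sample HTML are illustrative, not from the original project) runs the same two XPath expressions the crawler uses against a fake ranking row:

import org.seimicrawler.xpath.JXDocument;

import java.util.List;

public class XpathSmokeTest {
  public static void main(String[] args) {
    // One fake row shaped like Baidu's ranking table
    String html =
        "<table><tr><td class='keyword'><a>sample title</a></td>"
            + "<td class='last'><span>12345</span></td></tr></table>";
    JXDocument doc = JXDocument.create(html);
    // The same XPath expressions used by the crawler below
    List<Object> titles = doc.sel("//td[@class='keyword']/a[1]/text()");
    List<Object> indexes = doc.sel("//td[@class='last']/span/text()");
    System.out.println(titles + " " + indexes); // expected: [sample title] [12345]
  }
}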

Code

package com.hskj.tvdate.reptile;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.seimicrawler.xpath.JXDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import cn.hutool.core.date.DateUtil;
import cn.hutool.http.HttpUtil;
import cn.hutool.json.JSONObject;
import cn.hutool.json.JSONUtil;

/**
 * @program: tvdate
 * @description: crawls the Baidu Fengyunbang (百度风云榜) rankings
 * @author: hw
 * @create: 2020-01-29 17:18
 */
@Component
public class BaiDuSituation {

  private static final Logger log = LoggerFactory.getLogger(BaiDuSituation.class);

  @Autowired JdbcTemplate jdbcTemplate;

  static ExecutorService executorService = Executors.newFixedThreadPool(30);

  /** Category name -> ranking page URL */
  static final Map<String, String> MAP =
      new HashMap<String, String>() {
        {
          put("电影", "http://top.baidu.com/buzz?b=26&c=1&fr=topcategory_c1");
          put("电视剧", "http://top.baidu.com/buzz?b=4&c=2&fr=topcategory_c2");
          put("综艺", "http://top.baidu.com/buzz?b=19&c=3&fr=topcategory_c3");
          put("动漫", "http://top.baidu.com/buzz?b=23&c=5&fr=topcategory_c5");
          put("少儿", "http://top.baidu.com/buzz?b=1677&fr=topbuzz_b23_c5");
          put("纪录片", "http://top.baidu.com/buzz?b=1678&fr=topbuzz_b23_c5");
        }
      };

  /**
   * Category name -> intro API URL prefix. The URL-encoded title must be appended to each value.
   * The response body is Unicode-escaped; Hutool's JSON parser unescapes it during parsing.
   */
  static final Map<String, String> KEY_VALUE =
      new HashMap<String, String>() {
        {
          put("电影", "http://top.baidu.com/detail/intro?boardid=26&keyword=");
          put("电视剧", "http://top.baidu.com/detail/intro?boardid=4&keyword=");
          put("综艺", "http://top.baidu.com/detail/intro?boardid=19&keyword=");
          put("动漫", "http://top.baidu.com/detail/intro?boardid=23&keyword=");
          put("少儿", "http://top.baidu.com/detail/intro?boardid=1677&keyword=");
          put("纪录片", "http://top.baidu.com/detail/intro?boardid=1678&keyword=");
        }
      };

  /** The crawl itself, scheduled daily at 12:00 (six-field Spring cron). */
  @Scheduled(cron = "00 00 12 * * ?")
  public void addBaiduData() throws Exception {
    String today = DateUtil.today();
    log.info("Scheduled Baidu Fengyunbang crawl started");
    for (Map.Entry<String, String> entry : MAP.entrySet()) {
      String url = entry.getValue();
      String type = entry.getKey();
      // Fetch and parse the ranking page (the site serves GBK)
      Document document = Jsoup.parse(new URL(url).openStream(), "GBK", url);
      JXDocument underTest = JXDocument.create(document.toString());
      // XPath expressions for the title and the search index of each row
      String titleXpath = "//td[@class='keyword']/a[1]/text()";
      String indexXpath = "//td[@class='last']/span/text()";
      List<Object> titles = underTest.sel(titleXpath);
      List<Object> indexs = underTest.sel(indexXpath);
      for (int i = 0; i < titles.size(); i++) {
        int finalI = i;
        // Hand the per-title detail lookup and insert to the thread pool
        executorService.execute(
            () -> {
              // Title
              Object titleName = titles.get(finalI);
              // Search index
              Object index2 = indexs.get(finalI);
              String encoded = strToUrlEncode(titleName.toString());
              String introUrl = KEY_VALUE.get(type) + encoded;
              Map<String, Object> jsonToMap = summaryJsonToMap(HttpUtil.get(introUrl));
              Object urlBaike = null;
              Object imageUrl = null;
              Object imageBig = null;
              Object brief = null;
              try {
                // Baidu Baike link
                urlBaike = jsonToMap.get("url");
                // Small cover image
                imageUrl = jsonToMap.get("image");
                // Large cover image
                imageBig = jsonToMap.get("orin_image");
                // Synopsis
                brief = jsonToMap.get("abstract");
              } catch (Exception e) {
                log.info("No synopsis for this title: {}", titleName);
              }
              addBaiduDataBase(
                  titleName, type, brief, index2, urlBaike, imageUrl, imageBig, finalI, today);
            });
      }
    }
  }

  /**
   * URL-encode a Chinese string.
   *
   * @param str Chinese string
   * @return the URL-encoded form (GB18030), or an empty string if encoding fails
   */
  public static String strToUrlEncode(String str) {
    try {
      return URLEncoder.encode(str, "gb18030");
    } catch (UnsupportedEncodingException e) {
      log.error("URL encoding failed for: {}", str, e);
      return "";
    }
  }

  /**
   * Parse the intro JSON into a map.
   *
   * @param json the intro JSON
   * @return the parsed map; keys: url = Baidu Baike link, image = small cover,
   *     orin_image = large cover, abstract = synopsis
   */
  public static Map<String, Object> summaryJsonToMap(String json) {
    JSONObject content = null;
    try {
      JSONObject jsonObject = JSONUtil.parseObj(json);
      content = JSONUtil.parseObj(jsonObject.get("content"));
    } catch (Exception e) {
      log.warn("Failed to parse intro JSON: {}", json);
    }
    return content;
  }

  public void addBaiduDataBase(Object... obj) {
    // Parameterized statement, so quotes in crawled text cannot break the SQL or inject
    String sql =
        "insert into baidu_situation(create_day,no,title,`type`,brief,`index`,"
            + "url_baike,image_url,image_big) values(?,?,?,?,?,?,?,?,?)";
    int update =
        jdbcTemplate.update(
            sql, obj[8], obj[7], obj[0], obj[1], obj[2], obj[3], obj[4], obj[5], obj[6]);
    if (update < 1) {
      log.error("Database insert failed, params: {}", obj);
    }
  }
}
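
One easy-to-miss prerequisite: @Scheduled cron tasks only fire if scheduling is enabled on the Spring context. A minimal entry point (the class name here is an assumption, not taken from the original project) looks like this:

package com.hskj.tvdate;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;

@SpringBootApplication
@EnableScheduling // without this, the @Scheduled(cron = "00 00 12 * * ?") task never runs
public class TvdateApplication {
  public static void main(String[] args) {
    SpringApplication.run(TvdateApplication.class, args);
  }
}

The cron expression uses Spring's six-field format (second minute hour day month weekday), so the task fires once a day at 12:00:00.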

Database

CREATE TABLE `baidu_situation` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT 'title',
  `type` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT 'category',
  `no` int(11) DEFAULT NULL COMMENT 'rank',
  `brief` varchar(2550) COLLATE utf8_czech_ci DEFAULT NULL COMMENT 'synopsis',
  `index` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT 'search index (popularity)',
  `url_baike` varchar(500) COLLATE utf8_czech_ci DEFAULT NULL COMMENT 'Baidu Baike link',
  `image_url` varchar(500) COLLATE utf8_czech_ci DEFAULT NULL COMMENT 'cover image link',
  `image_big` varchar(500) COLLATE utf8_czech_ci DEFAULT NULL COMMENT 'large cover image link',
  `create_day` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT 'creation date (yyyy-MM-dd), used for indexing',
  `remark` varchar(255) CHARACTER SET utf8 DEFAULT NULL COMMENT 'remark',
  `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'creation time',
  `state` int(11) NOT NULL DEFAULT '0' COMMENT 'record validity (0 = valid, 1 = invalid)',
  PRIMARY KEY (`id`),
  UNIQUE KEY `titleDay` (`title`,`type`,`create_day`) USING BTREE COMMENT 'unique index to prevent duplicate crawls'
) ENGINE=InnoDB AUTO_INCREMENT=301 DEFAULT CHARSET=utf8 COLLATE=utf8_czech_ci;
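
Because of the titleDay unique key, running the task twice on the same day makes jdbcTemplate.update throw a duplicate-key error instead of inserting. If silent idempotence is preferred, one option (a sketch using MySQL's INSERT IGNORE, not part of the original code) is to change the statement to:

-- INSERT IGNORE downgrades the unique-key violation to a warning and affects 0 rows
INSERT IGNORE INTO baidu_situation
  (create_day, no, title, `type`, brief, `index`, url_baike, image_url, image_big)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?);

Note that a skipped duplicate then reports 0 affected rows, which the existing update < 1 check would log as a failure, so that check would need adjusting; catching org.springframework.dao.DuplicateKeyException around the update call is the alternative.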
