百度热榜爬虫

Java 爬虫(Jsoup)

百度热榜

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Scrapes the Baidu hot-search ranking page with Jsoup and stores each
 * (title, search count) pair, stamped with today's date, in the MySQL table
 * {@code sys_newsmessage}.
 */
public class HotsNew {

	/**
	 * Entry point: scrapes the ranking page, then inserts one row per scraped
	 * entry into {@code sys_newsmessage (title, seachNum, hottime)}.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		List<Map<String, String>> baiduhotnews = Baiduhotnews();

		// NOTE(review): connection settings and credentials are hard-coded;
		// consider moving them to configuration.
		String jdbcUrl = "jdbc:mysql://localhost/test";
		String user = "root";// database user name
		String password = "root";// database password
		String driverClass = "com.mysql.jdbc.Driver";// MySQL JDBC driver class
		String sql = "insert into sys_newsmessage (title,seachNum,hottime) values(?,?,?)";
		String today = new SimpleDateFormat("yyyy-MM-dd").format(new Date());

		try {
			Class.forName(driverClass);// makes the driver register itself with DriverManager
		} catch (ClassNotFoundException e) {
			e.printStackTrace();
			return;
		}

		// try-with-resources closes the statement and the connection even when
		// an insert fails part-way through.
		try (Connection con = DriverManager.getConnection(jdbcUrl, user, password);
				PreparedStatement pst = con.prepareStatement(sql)) {
			for (Map<String, String> entry : baiduhotnews) {
				for (Map.Entry<String, String> row : entry.entrySet()) {
					String title = row.getKey();
					String rawCount = row.getValue();
					System.out.println(title + ":" + rawCount);

					int searchCount;
					try {
						searchCount = Integer.parseInt(rawCount.trim());
					} catch (NumberFormatException e) {
						// One malformed cell must not abort the remaining inserts.
						System.err.println("Skipping \"" + title + "\": search count is not an integer: " + rawCount);
						continue;
					}

					pst.setString(1, title);
					pst.setInt(2, searchCount);
					pst.setString(3, today);
					pst.executeUpdate();
				}
			}
		} catch (SQLException e) {// connecting or inserting failed
			e.printStackTrace();
		}
	}

	/** URL of the Baidu ranking page that is scraped. */
	public final static String baseUrl = "http://top.baidu.com/buzz?b=1&c=513&fr=topbuzz_b42_c513";
	/** Connection timeout in milliseconds. */
	public static int timeout = 30 * 1000;
	/** Maximum number of connection attempts. */
	public static int times = 10;
	/** Pool of User-Agent strings; one is picked at random for each request. */
	public static String UserAgent[] = {
			"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
			"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
			"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
			"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
			"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32",
			"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
			"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 UBrowser/5.6.12150.8 Safari/537.36",
			"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
			"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32" };

	/**
	 * Picks a random entry from {@link #UserAgent}.
	 *
	 * @return one of the configured User-Agent strings
	 */
	public static String getRandomUA() {
		return UserAgent[(int) (Math.random() * (UserAgent.length))];
	}

	/**
	 * Fetches and parses the page at {@code url}, retrying up to {@link #times}
	 * times. Each attempt uses a random User-Agent and the {@link #timeout}
	 * setting; a failed attempt prints its stack trace and the next one starts.
	 *
	 * @param url address of the page to download
	 * @return the parsed document, or {@code null} if every attempt failed
	 */
	public static Document getDocument(String url) {
		for (int attempt = 0; attempt < times; attempt++) {
			try {
				Document doc = Jsoup.connect(url).header("User-Agent", getRandomUA()).timeout(timeout).get();
				if (doc != null) {
					return doc;
				}
			} catch (Exception e) {
				// Deliberately broad: any failure just triggers the next attempt.
				e.printStackTrace();
			}
		}
		return null;
	}

	/**
	 * Scrapes the ranking table at {@link #baseUrl}.
	 * <p>
	 * For every {@code tr} inside a {@code tbody}, the title is read from the
	 * anchors of the element with class {@code keyword} and the search count
	 * from the element with class {@code last}. Rows missing either value are
	 * skipped.
	 *
	 * @return a list holding exactly one map from title to search-count text;
	 *         the map is empty when the page could not be downloaded
	 */
	public static List<Map<String, String>> Baiduhotnews() {
		List<Map<String, String>> result = new ArrayList<>();
		Map<String, String> map = new HashMap<>();
		result.add(map);

		Document document = getDocument(baseUrl);
		if (document == null) {
			// getDocument returns null once all retries fail; report "nothing
			// scraped" instead of failing with a NullPointerException.
			return result;
		}

		for (Element tbody : document.getElementsByTag("tbody")) {
			for (Element row : tbody.getElementsByTag("tr")) {
				String title = "";
				String searchCount = "";

				for (Element keywordCell : row.getElementsByClass("keyword")) {
					String anchorText = keywordCell.getElementsByTag("a").text();
					// Drops the last 6 characters of the combined anchor text,
					// presumably a fixed suffix added by the page markup (TODO
					// confirm against the live page). Text shorter than that is
					// treated as "no title" rather than throwing.
					title = anchorText.length() > 6 ? anchorText.substring(0, anchorText.length() - 6) : "";
				}
				for (Element countCell : row.getElementsByClass("last")) {
					searchCount = countCell.getElementsByClass("last").text();
				}

				// Compare by content: text() may return an empty string that is
				// not the same object as the "" literal.
				if (!title.isEmpty() && !searchCount.isEmpty()) {
					map.put(title, searchCount);
				}
			}
		}
		return result;
	}
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值