Java 爬虫 Jsoup
百度热榜
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scrapes the Baidu hot-search board with Jsoup and stores every keyword together with its
 * search count in the MySQL table {@code sys_newsmessage}.
 *
 * <p>The class is a small command-line tool: {@link #main(String[])} downloads the board via
 * {@link #Baiduhotnews()} and inserts one row per keyword, stamped with today's date.
 */
public class HotsNew {

    /**
     * Number of characters removed from the end of a keyword cell's anchor text.
     * NOTE(review): presumably this is the label of the auxiliary link(s) that follow the
     * title link inside the cell; confirm against the page markup.
     */
    private static final int TITLE_SUFFIX_LENGTH = 6;

    /**
     * Downloads the hot list and inserts each keyword into {@code sys_newsmessage}.
     *
     * <p>Rows whose search count is not a valid integer are reported on {@code System.err} and
     * skipped instead of aborting the whole run. The JDBC connection and statement are closed
     * when the method finishes, whether or not an error occurred.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        List<Map<String, String>> baiduhotnews = Baiduhotnews();
        // NOTE(review): connection settings and credentials are hard-coded here; consider
        // moving them to configuration before using this outside a local test database.
        String Url = "jdbc:mysql://localhost/test";
        String name = "root";
        String psd = "root";
        String jdbcName = "com.mysql.jdbc.Driver";
        String sql = "insert into sys_newsmessage (title,seachNum,hottime) values(?,?,?)";
        // The formatter is a local, so SimpleDateFormat's lack of thread-safety is not an issue.
        String date1 = new SimpleDateFormat("yyyy-MM-dd").format(new Date());
        try {
            Class.forName(jdbcName);
            // try-with-resources guarantees both JDBC handles are released.
            try (Connection con = DriverManager.getConnection(Url, name, psd);
                    PreparedStatement pst = con.prepareStatement(sql)) {
                for (Map<String, String> keywords : baiduhotnews) {
                    for (Map.Entry<String, String> entry : keywords.entrySet()) {
                        String title = entry.getKey();
                        String rawCount = entry.getValue();
                        System.out.println(title + ":" + rawCount);
                        int searchCount;
                        try {
                            searchCount = Integer.parseInt(rawCount.trim());
                        } catch (NumberFormatException e) {
                            // One malformed cell must not prevent the remaining rows from being stored.
                            System.err.println("Skipping \"" + title
                                    + "\": search count is not an integer: " + rawCount);
                            continue;
                        }
                        pst.setString(1, title);
                        pst.setInt(2, searchCount);
                        pst.setString(3, date1);
                        pst.executeUpdate();
                    }
                }
            }
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Address of the Baidu board that is scraped. */
    public final static String baseUrl = "http://top.baidu.com/buzz?b=1&c=513&fr=topbuzz_b42_c513";

    /** Timeout passed to Jsoup for each request, in milliseconds. */
    public static int timeout = 30 * 1000;

    /** Maximum number of download attempts made by {@link #getDocument(String)}. */
    public static int times = 10;

    /** Pool of User-Agent header values; one is picked at random for every request. */
    public static String UserAgent[] = {
            "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 UBrowser/5.6.12150.8 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32" };

    /**
     * Picks one entry of {@link #UserAgent} at random.
     *
     * @return a User-Agent header value
     */
    public static String getRandomUA() {
        return UserAgent[(int) (Math.random() * (UserAgent.length))];
    }

    /**
     * Downloads and parses a page, retrying up to {@link #times} times.
     *
     * <p>Every attempt uses a freshly chosen random User-Agent. A failed attempt prints its
     * stack trace and the next attempt starts immediately.
     *
     * @param url address of the page to fetch
     * @return the parsed document, or {@code null} when every attempt failed
     */
    public static Document getDocument(String url) {
        for (int attempt = 0; attempt < times; attempt++) {
            try {
                Document doc = Jsoup.connect(url).header("User-Agent", getRandomUA()).timeout(timeout).get();
                if (doc != null) {
                    return doc;
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return null;
    }

    /**
     * Scrapes the board at {@link #baseUrl} into a keyword-to-search-count map.
     *
     * <p>Every {@code tr} below a {@code tbody} is inspected: the title comes from the anchor
     * text of the element with class {@code keyword} (minus its trailing
     * {@value #TITLE_SUFFIX_LENGTH} characters) and the count from the element with class
     * {@code last}. Rows missing either value are ignored.
     *
     * @return a list holding exactly one map from keyword title to search count text; the map
     *         is empty when the page could not be downloaded or contained no usable rows
     */
    public static List<Map<String, String>> Baiduhotnews() {
        List<Map<String, String>> result = new ArrayList<>();
        Map<String, String> map = new HashMap<>();
        result.add(map);

        Document document = getDocument(baseUrl);
        if (document == null) {
            // All download attempts failed; report it and hand back the empty map.
            System.err.println("Could not download " + baseUrl + " after " + times + " attempts");
            return result;
        }

        for (Element el : document.getElementsByTag("tbody")) {
            for (Element eltr : el.getElementsByTag("tr")) {
                String titlekey = "";
                String seachNum1 = "";
                for (Element eldata : eltr.getElementsByClass("keyword")) {
                    String linkText = eldata.getElementsByTag("a").text();
                    // Guard the suffix removal: text shorter than the suffix yields no title
                    // (the row is skipped) instead of a StringIndexOutOfBoundsException.
                    titlekey = linkText.length() > TITLE_SUFFIX_LENGTH
                            ? linkText.substring(0, linkText.length() - TITLE_SUFFIX_LENGTH)
                            : "";
                }
                for (Element dataseachNum : eltr.getElementsByClass("last")) {
                    seachNum1 = dataseachNum.getElementsByClass("last").text();
                }
                // Compare by content; `!=` on strings only compares references.
                if (!titlekey.isEmpty() && !seachNum1.isEmpty()) {
                    map.put(titlekey, seachNum1);
                }
            }
        }
        return result;
    }
}