public class HttpClient {
// Index of the first record in the window currently being requested;
// passed to getListPageEntity by saveMessage and advanced by main after each window.
public static int startRecord = 1;
// Index of the last record in the window currently being requested;
// passed to getListPageEntity by saveMessage and advanced by main after each window.
public static int endRecord = 75;
// 0 means "not yet known". saveMessage fills it in from the text of the
// "totalrecord" element of the list response, so despite the name it holds
// the total number of records rather than a page count.
public static int maxPage = 0;
// Site prefix prepended to each record's href to build the detail-page link.
public static String headUrl = "http://www.ytgxq.gov.cn";
/**
 * Entry point: walks the record list in fixed-size windows, calling
 * {@link #saveMessage()} once per window until every record has been requested.
 *
 * <p>{@code maxPage} starts at 0 ("unknown") and is filled in by
 * {@code saveMessage()} from the response's {@code totalrecord} value; it is
 * reset to 0 once the crawl finishes.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Window size; matches the initial startRecord/endRecord span (1..75).
    final int windowSize = 75;
    try {
        // Continue while the window still starts inside the record range.
        // Testing endRecord here would skip the final, partially filled window
        // (e.g. records 76-100 when there are 100 records in total).
        // NOTE(review): assumes the server tolerates endRecord > total on the
        // last window and just returns the records that exist - confirm.
        while (maxPage == 0 || startRecord <= maxPage) {
            saveMessage();
            if (maxPage == 0) {
                // The total is still unknown after a request: either the list
                // page could not be fetched or it reported zero records.
                // Stop instead of requesting ever-higher windows forever.
                System.err.println("Total record count unavailable; stopping at window "
                        + startRecord + "-" + endRecord);
                break;
            }
            startRecord += windowSize;
            endRecord += windowSize;
        }
        maxPage = 0;
    } catch (Exception e) {
        e.printStackTrace();
    }
}
public synchronized static void saveMessage(){
try {
//获取列表页的Entity解析的字符串对象
String listPage = getListPageEntity(startRecord, endRecord);
if (listPage != null){
Document listDocument = Jsoup.parse(listPage.replace("<![CDATA[","").replace("]]>",""));
if (maxPage == 0){
String totalrecord = listDocument.select("totalrecord").text();
maxPage = Integer.parseInt(totalrecord);
}
Elements elements = listDocument.select("record");
for (Element element : elements) {
if (element != null && elements.outerHtml().isEmpty() == false){
Elements a = element.select("a");
String title = a.attr("title"); //列表标题
String href = a.attr("href"); //列表标题链接
String date = element.select("record span").text(); //发布时间
int count = JdbcUtils.selectById(href); //判断数据库中是否存在该网站记录
if (count == 0){
//创建实体类对象
XinXiInfoTest xin = new XinXiInfoTest();
String detailLink = headUrl+href;
xin.setId(href);
HttpClient爬取网页信息并解析加入数据库
最新推荐文章于 2023-04-06 00:21:28 发布