写在前面的叫前言
本篇博客只是交流学习,如有不妥请联系删除
- 续接前两篇爬图片和音乐,本次爬了一些文字信息,如果后期有需要再把图片加上
- 刚才简单地把资讯爬了一下:抓取列表页的标题和详情页的文字内容,并保存到桌面上的 txt 文件中
- jar包支撑与前两篇一致,不再赘述。
- https://blog.csdn.net/m0_37615458/article/details/103867889
- https://blog.csdn.net/m0_37615458/article/details/103902165
一、主方法
import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; import javax.swing.filechooser.FileSystemView; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import spiderkugou.HtmlManage; import spiderkugou.HttpGetConnect; /** * @author dongwn 拟抓取页面的资讯数据 */ public class TestGetInformation { static String url = "http://news.chemnet.com/list-11-11-PAGE.html";// 列表页 public static void main(String[] args) throws IOException, ParseException, InterruptedException { String newUrl = "";// 动态url for (int i = 1; i < 50; i++) {// 只抓50页吧 newUrl = url.replace("PAGE", i + ""); getConnection(newUrl); Thread.sleep(1000); } } // 获取链接获取元素 @SuppressWarnings("static-access") public static void getConnection(String url) throws IOException, ParseException { StringBuffer sb = new StringBuffer(); HttpGetConnect connect = new HttpGetConnect(); String content = connect.connect(url, "utf-8"); HtmlManage html = new HtmlManage(); Document doc = html.manage(content);// 转 Document Elements elements = doc.select(".content-list>ul>li"); sb.append(System.getProperty("user.name") + "----->>>" + new SimpleDateFormat("yyyy-MM-dd hh:mm:ss").format(new Date())); sb.append("\r\n");// 换行 sb.append("\r\n");// 换行 for (Element ele : elements) { sb.append("标题---->>>"); sb.append(ele.select("a").text().trim());// 获取标题 sb.append("\r\n");// 换行 String detailUrl = "http://news.chemnet.com" + ele.select("a").attr("href"); String detailcontent = connect.connect(detailUrl, "utf-8"); HtmlManage detailhtml = new HtmlManage(); Document detailDoc = detailhtml.manage(detailcontent);// 转 Document String detailContent = detailDoc.select(".detail-text>div").get(0).text(); sb.append("内容---->>>"); sb.append(detailContent); sb.append("\r\n");// 换行 sb.append("\r\n");// 换行 sb.append("\r\n");// 换行 } writeToTxt(sb); } public static void 
writeToTxt(StringBuffer sb) { FileWriter fw = null; File desktopDir = FileSystemView.getFileSystemView().getHomeDirectory(); String desktopPath = desktopDir.getAbsolutePath(); try { File f = new File(desktopPath + "/" + new SimpleDateFormat("yyyy-MM-dd").format(new Date()) + "今日资讯.txt"); fw = new FileWriter(f, true); } catch (IOException e) { e.printStackTrace(); } PrintWriter pw = new PrintWriter(fw); pw.println(sb); pw.flush(); try { fw.flush(); pw.close(); fw.close(); } catch (IOException e) { e.printStackTrace(); } } }
二、工具类(两个)
import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.http.HttpEntity; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.conn.BasicHttpClientConnectionManager; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; /** * httpclient 工具类 */ public class HttpGetConnect { /** * 获取html内容 * * @param url * @param charsetName * UTF-8、GB2312 * @return * @throws IOException */ public static String connect(String url, String charsetName) throws IOException { BasicHttpClientConnectionManager connManager = new BasicHttpClientConnectionManager(); CloseableHttpClient httpclient = HttpClients.custom().setConnectionManager(connManager).build(); String content = ""; try { HttpGet httpget = new HttpGet(url); RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(5000).setConnectTimeout(50000) .setConnectionRequestTimeout(50000).build(); httpget.setConfig(requestConfig); httpget.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); httpget.setHeader("Accept-Encoding", "gzip,deflate,sdch"); httpget.setHeader("Accept-Language", "zh-CN,zh;q=0.8"); httpget.setHeader("Connection", "keep-alive"); httpget.setHeader("Upgrade-Insecure-Requests", "1"); httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"); httpget.setHeader("cache-control", "max-age=0"); httpget.setHeader("Referer", "https://www.kugou.com/song/"); // 设置cookie httpget.setHeader("Cookie", "kg_mid=9393340fecff864a4d6c4e95099b2be1;"); CloseableHttpResponse response = httpclient.execute(httpget); int status = 
response.getStatusLine().getStatusCode(); if (status >= 200 && status < 300) { HttpEntity entity = response.getEntity(); InputStream instream = entity.getContent(); BufferedReader br = new BufferedReader(new InputStreamReader(instream, charsetName)); StringBuffer sbf = new StringBuffer(); String line = null; while ((line = br.readLine()) != null) { sbf.append(line + "\n"); } br.close(); content = sbf.toString(); } else { content = ""; } } catch (Exception e) { e.printStackTrace(); } finally { httpclient.close(); } log.info("content is " + content); return content; } private static Log log = LogFactory.getLog(HttpGetConnect.class); }
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Thin jsoup wrapper: parses HTML into a {@link Document} and extracts the
 * inner HTML of elements selected by tag, class, or attribute value.
 */
public class HtmlManage {

    private static Log log = LogFactory.getLog(HtmlManage.class);

    /**
     * Parses an HTML string.
     *
     * @param html markup to parse
     * @return the parsed document
     */
    public Document manage(String html) {
        return Jsoup.parse(html);
    }

    /**
     * Fetches a URL with jsoup's own HTTP client and parses the response.
     *
     * @param url address to fetch
     * @return the parsed document
     * @throws IOException if the request fails
     */
    public Document manageDirect(String url) throws IOException {
        return Jsoup.connect(url).get();
    }

    /**
     * Returns the inner HTML of every element with the given tag name.
     *
     * @param doc document to search
     * @param tag tag name to match
     * @return one entry per matching element, in document order
     */
    public List<String> manageHtmlTag(Document doc, String tag) {
        return innerHtmlOf(doc.getElementsByTag(tag));
    }

    /**
     * Returns the inner HTML of every element carrying the given class.
     *
     * @param doc  document to search
     * @param clas class name to match
     * @return one entry per matching element, in document order
     */
    public List<String> manageHtmlClass(Document doc, String clas) {
        return innerHtmlOf(doc.getElementsByClass(clas));
    }

    /**
     * Returns the inner HTML of every element whose attribute {@code key}
     * has the value {@code value}.
     *
     * @param doc   document to search
     * @param key   attribute name
     * @param value attribute value to match
     * @return one entry per matching element, in document order
     */
    public List<String> manageHtmlKey(Document doc, String key, String value) {
        return innerHtmlOf(doc.getElementsByAttributeValue(key, value));
    }

    /** Collects {@code html()} of each matched element into a new mutable list. */
    private static List<String> innerHtmlOf(Elements matches) {
        List<String> fragments = new ArrayList<String>(matches.size());
        for (org.jsoup.nodes.Element match : matches) {
            fragments.add(match.html());
        }
        return fragments;
    }
}
三、效果图
- 至此结束