java爬取资讯（新闻）

最新推荐文章于 2024-05-20 22:06:37 发布

dongwn

最新推荐文章于 2024-05-20 22:06:37 发布

阅读量1.7k

点赞数

分类专栏： javaweb 文章标签： java

本文链接：https://blog.csdn.net/m0_37615458/article/details/103906358

版权

javaweb 专栏收录该内容

20 篇文章 2 订阅

订阅专栏

写在前面的叫前言

本篇博客只是交流学习，如有不妥请联系删除

续接前两篇爬图片和音乐，本次爬了一些文字信息，如果后期有需要再把图片加上
刚才简单地把资讯爬了一下：抓取列表页的标题和详情页的文字内容，并保存到桌面上的 txt 文件中。
jar包支撑与前两篇一致，不再赘述。
https://blog.csdn.net/m0_37615458/article/details/103867889
https://blog.csdn.net/m0_37615458/article/details/103902165

一、主方法

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

import javax.swing.filechooser.FileSystemView;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import spiderkugou.HtmlManage;
import spiderkugou.HttpGetConnect;

/**
 * @author dongwn 拟抓取页面的资讯数据
 */
public class TestGetInformation {

    static String url = "http://news.chemnet.com/list-11-11-PAGE.html";// 列表页

    public static void main(String[] args) throws IOException, ParseException, InterruptedException {
        String newUrl = "";// 动态url
        for (int i = 1; i < 50; i++) {// 只抓50页吧
            newUrl = url.replace("PAGE", i + "");
            getConnection(newUrl);
            Thread.sleep(1000);
        }
    }

    // 获取链接获取元素
    @SuppressWarnings("static-access")
    public static void getConnection(String url) throws IOException, ParseException {
        StringBuffer sb = new StringBuffer();
        HttpGetConnect connect = new HttpGetConnect();
        String content = connect.connect(url, "utf-8");
        HtmlManage html = new HtmlManage();
        Document doc = html.manage(content);// 转 Document
        Elements elements = doc.select(".content-list>ul>li");
        sb.append(System.getProperty("user.name") + "----->>>"
                + new SimpleDateFormat("yyyy-MM-dd hh:mm:ss").format(new Date()));
        sb.append("\r\n");// 换行
        sb.append("\r\n");// 换行
        for (Element ele : elements) {
            sb.append("标题---->>>");
            sb.append(ele.select("a").text().trim());// 获取标题
            sb.append("\r\n");// 换行
            String detailUrl = "http://news.chemnet.com" + ele.select("a").attr("href");
            String detailcontent = connect.connect(detailUrl, "utf-8");
            HtmlManage detailhtml = new HtmlManage();
            Document detailDoc = detailhtml.manage(detailcontent);// 转 Document
            String detailContent = detailDoc.select(".detail-text>div").get(0).text();
            sb.append("内容---->>>");
            sb.append(detailContent);
            sb.append("\r\n");// 换行
            sb.append("\r\n");// 换行
            sb.append("\r\n");// 换行
        }
        writeToTxt(sb);
    }

    public static void writeToTxt(StringBuffer sb) {
        FileWriter fw = null;
        File desktopDir = FileSystemView.getFileSystemView().getHomeDirectory();
        String desktopPath = desktopDir.getAbsolutePath();
        try {
            File f = new File(desktopPath + "/" + new SimpleDateFormat("yyyy-MM-dd").format(new Date()) + "今日资讯.txt");
            fw = new FileWriter(f, true);
        } catch (IOException e) {
            e.printStackTrace();
        }
        PrintWriter pw = new PrintWriter(fw);
        pw.println(sb);
        pw.flush();
        try {
            fw.flush();
            pw.close();
            fw.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

二、工具类（两个）

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.BasicHttpClientConnectionManager;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

/**
 * httpclient 工具类
 */
public class HttpGetConnect {

    /**
     * 获取html内容
     * 
     * @param url
     * @param charsetName
     *            UTF-8、GB2312
     * @return
     * @throws IOException
     */
    public static String connect(String url, String charsetName) throws IOException {
        BasicHttpClientConnectionManager connManager = new BasicHttpClientConnectionManager();

        CloseableHttpClient httpclient = HttpClients.custom().setConnectionManager(connManager).build();
        String content = "";

        try {
            HttpGet httpget = new HttpGet(url);

            RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(5000).setConnectTimeout(50000)
                    .setConnectionRequestTimeout(50000).build();
            httpget.setConfig(requestConfig);
            httpget.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            httpget.setHeader("Accept-Encoding", "gzip,deflate,sdch");
            httpget.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
            httpget.setHeader("Connection", "keep-alive");
            httpget.setHeader("Upgrade-Insecure-Requests", "1");
            httpget.setHeader("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
            httpget.setHeader("cache-control", "max-age=0");

            httpget.setHeader("Referer", "https://www.kugou.com/song/");

            // 设置cookie
            httpget.setHeader("Cookie", "kg_mid=9393340fecff864a4d6c4e95099b2be1;");

            CloseableHttpResponse response = httpclient.execute(httpget);

            int status = response.getStatusLine().getStatusCode();
            if (status >= 200 && status < 300) {

                HttpEntity entity = response.getEntity();
                InputStream instream = entity.getContent();
                BufferedReader br = new BufferedReader(new InputStreamReader(instream, charsetName));
                StringBuffer sbf = new StringBuffer();
                String line = null;
                while ((line = br.readLine()) != null) {
                    sbf.append(line + "\n");
                }

                br.close();
                content = sbf.toString();
            } else {
                content = "";
            }

        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            httpclient.close();
        }
        log.info("content is " + content);
        return content;
    }

    private static Log log = LogFactory.getLog(HttpGetConnect.class);
}

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * html manage 工具类
 */
public class HtmlManage {

    public Document manage(String html) {
        Document doc = Jsoup.parse(html);
        return doc;
    }

    public Document manageDirect(String url) throws IOException {
        Document doc = Jsoup.connect(url).get();
        return doc;
    }

    public List<String> manageHtmlTag(Document doc, String tag) {
        List<String> list = new ArrayList<String>();

        Elements elements = doc.getElementsByTag(tag);
        for (int i = 0; i < elements.size(); i++) {
            String str = elements.get(i).html();
            list.add(str);
        }
        return list;
    }

    public List<String> manageHtmlClass(Document doc, String clas) {
        List<String> list = new ArrayList<String>();

        Elements elements = doc.getElementsByClass(clas);
        for (int i = 0; i < elements.size(); i++) {
            String str = elements.get(i).html();
            list.add(str);
        }
        return list;
    }

    public List<String> manageHtmlKey(Document doc, String key, String value) {
        List<String> list = new ArrayList<String>();

        Elements elements = doc.getElementsByAttributeValue(key, value);
        for (int i = 0; i < elements.size(); i++) {
            String str = elements.get(i).html();
            list.add(str);
        }
        return list;
    }

    private static Log log = LogFactory.getLog(HtmlManage.class);
}

三、效果图

至此结束

dongwn

关注

0
点赞
踩
11

收藏

觉得还不错? 一键收藏
0
评论
java爬取资讯（新闻）

本篇博客只是交流学习，如有不妥请联系删除续接前两篇爬图片和音乐，本次爬了一些文字信息，如果后期有需要再把图片加上https://blog.csdn.net/m0_37615458/article/details/103867889https://blog.csdn.net/m0_37615458/article/details/103902165刚才简单的把资讯爬了一下，比较简单的...
复制链接

扫一扫

专栏目录