爬虫-java-webmagic-搜视节目单(输出到单个文件)

webmagic 架构图:

webmagic 架构图

Processor:

/**
 * WebMagic page processor for tvsou.com EPG (programme guide) pages.
 *
 * <p>For every downloaded page it derives the channel code from the page URL, reads the
 * schedule date and the programme list through absolute XPath expressions, and stores all
 * rows as a single tab-separated text block in the page's result fields.
 */
public class TvSouProcessor implements PageProcessor {

    /** Absolute XPath of the text node holding the schedule date. */
    private static final String DATE_XPATH =
            "/html/body/div[3]/div[3]/div[3]/div[2]/div[1]/div[2]/span/text()";

    /** Absolute XPath of the programme list items; {@code [i]} is appended to pick one item. */
    private static final String ITEM_XPATH = "/html/body/div[3]/div[3]/div[3]/div[2]/div[3]/ol/li";

    /** Channel code (as it appears in the URL) to channel name, loaded once from the CSV mapping file. */
    Map<String, String> channelCodeMap = NIOUtils.csvFile2Map(TvSouConstant.CHANNELMAP_FILEPATH);

    private Site site = Site.me().setCycleRetryTimes(5).setRetryTimes(5).setSleepTime(500).setTimeOut(3 * 60 * 1000)
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0")
            .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
            .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
    // .setCharset("UTF-8")

    /**
     * Extracts one output row per programme list item and stores the concatenated rows in the page.
     *
     * <p>Each row has the form {@code channel \t date \t time \t programme \t \n}. The rows are
     * stored under the empty field key.
     *
     * @param page the downloaded page; its URL is expected to look like
     *             {@code .../epg/<channelCode>/<date>?class=...}
     */
    @Override
    public void process(Page page) {
        String url = page.getUrl().toString();

        // The channel code is the path segment between "epg/" and the last '/'.
        String channelCode = url.substring(url.indexOf("epg/") + 4, url.lastIndexOf("/"));

        // Fall back to the raw code when the mapping file has no entry, instead of failing
        // with a NullPointerException and losing the whole page.
        String channel = channelCodeMap.get(channelCode);
        if (channel == null) {
            channel = channelCode;
        }

        String date = page.getHtml().xpath(DATE_XPATH).toString();

        StringBuilder accum = new StringBuilder();
        int size = page.getHtml().xpath(ITEM_XPATH).nodes().size();
        // XPath positions are 1-based.
        for (int i = 1; i <= size; i++) {
            String itemXpath = ITEM_XPATH + "[" + i + "]";

            // Start time of the programme.
            String time = page.getHtml().xpath(itemXpath + "/span/text()").toString();
            // Programme title.
            String pg = page.getHtml().xpath(itemXpath + "/a/text()").toString();

            accum.append(channel).append("\t");
            accum.append(date).append("\t");
            accum.append(time).append("\t");
            accum.append(pg).append("\t");
            accum.append("\n");
        }

        page.putField("", accum.toString());

        // NOTE: following links found on the page (e.g. page.getHtml().links().regex(
        // "http://tvsou.com/epg/cctv-[\\d]/<yyyyMMdd>\\?class=yangshi")) was tried and abandoned
        // because, per the original author's note, it matched only a single URL. The start URLs
        // are therefore supplied by the caller instead of being discovered here.
    }

    /**
     * Returns the crawl settings (retries, sleep time, timeout, user agent, headers) used for this processor.
     */
    @Override
    public Site getSite() {
        return site;
    }


}

执行入口

TvSouExecutor:

/**
 * Entry point: builds one tvsou.com EPG URL per channel code for yesterday's date and runs a
 * single-threaded spider whose results are written to one file.
 */
public class TvSouExecutor {

    /**
     * Loads the channel-code mapping, builds the start URLs and runs the crawl.
     *
     * @param args unused
     * @throws FileNotFoundException        propagated from the {@code OneFilePipeline} constructor
     * @throws UnsupportedEncodingException propagated from the {@code OneFilePipeline} constructor
     */
    public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {

        // Channel code (URL segment) -> channel name, read from the mapping file.
        Map<String, String> channelCodeMap = NIOUtils.csvFile2Map(TvSouConstant.CHANNELMAP_FILEPATH);

        // Yesterday's date. Lower-case 'y' is the calendar year; upper-case 'Y' is the
        // week-based year, which yields the wrong year for days around January 1st.
        Calendar cal = Calendar.getInstance();
        cal.add(Calendar.DATE, -1);
        String yesterdayStr = new SimpleDateFormat("yyyyMMdd").format(cal.getTime());
        String yesterdayStr2 = new SimpleDateFormat("yyyy-MM-dd").format(cal.getTime());

        List<String> urlList = new ArrayList<String>();
        for (String channelCode : channelCodeMap.keySet()) {
            // Codes containing "cctv" use class=yangshi (CCTV channels); all others use
            // class=weishi (provincial satellite channels).
            String category = channelCode.contains("cctv") ? "yangshi" : "weishi";
            urlList.add("http://tvsou.com/epg/" + channelCode + "/" + yesterdayStr + "?class=" + category);
        }

        String[] urls = urlList.toArray(new String[urlList.size()]);

        Spider.create(new TvSouProcessor())
                // Uncomment to keep a record of every crawled URL:
//                .setScheduler(new FileCacheQueueScheduler("data/cache"))
                .addUrl(urls)
                .addPipeline(new OneFilePipeline("data/epg" + yesterdayStr2))
                .thread(1)
                .run();

    }
}

此处使用了 webmagic 作者黄亿华(黄大大)的示例代码

OneFilePipeline:

/**
 * Pipeline that prints every extracted field value, without keys or separators, to a single
 * UTF-8 encoded file.
 */
public class OneFilePipeline extends FilePersistentBase implements Pipeline {

    private Logger logger = LoggerFactory.getLogger(getClass());

    /** Writer for the output file; opened in the constructor and flushed after every result. */
    private PrintWriter printWriter;

    /**
     * Creates a pipeline that writes to the default path {@code /data/webmagic/}.
     */
    public OneFilePipeline() throws FileNotFoundException, UnsupportedEncodingException {
        this("/data/webmagic/");
    }

    /**
     * Creates a pipeline that writes to the file obtained from {@code getFile(path)}.
     *
     * @param path output location, also registered through {@code setPath}
     */
    public OneFilePipeline(String path) throws FileNotFoundException, UnsupportedEncodingException {
        setPath(path);
        FileOutputStream fileStream = new FileOutputStream(getFile(path));
        OutputStreamWriter utf8Writer = new OutputStreamWriter(fileStream, "UTF-8");
        printWriter = new PrintWriter(utf8Writer);
    }

    /**
     * Prints each field value of {@code resultItems}; an {@link Iterable} value is printed
     * element by element. The writer is flushed once all values have been printed.
     *
     * @param resultItems extracted fields of one page
     * @param task        the running task (not used)
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        for (Object fieldValue : resultItems.getAll().values()) {
            if (!(fieldValue instanceof Iterable)) {
                printWriter.print(fieldValue);
                continue;
            }
            for (Object element : (Iterable<?>) fieldValue) {
                printWriter.print(element);
            }
        }
        printWriter.flush();
    }
}

工具类

/**
 * Reads a two-column CSV file ({@code code,name} per line) into a map from code to name.
 *
 * <p>The file is read in chunks through a {@link FileChannel}. Lines end at {@code '\n'};
 * all whitespace (including {@code '\r'}) is stripped from a line before it is split on
 * {@code ','}. Lines with fewer than two fields (e.g. blank lines) are skipped, and a last
 * line without a trailing newline is still parsed. Bytes are decoded with the platform
 * default charset.
 *
 * <p>I/O failures are not propagated: the stack trace is printed and the entries read so
 * far (possibly none) are returned.
 *
 * @param filePath path of the CSV file
 * @return a mutable map of code to name; empty if the file cannot be read
 */
public static Map<String, String> csvFile2Map(String filePath) {
        Map<String, String> channelCode2NameMap = new HashMap<String, String>();

        // try-with-resources closes both the file and its channel, even on failure.
        try (RandomAccessFile input = new RandomAccessFile(filePath, "r");
             FileChannel fileChannel = input.getChannel()) {
            ByteBuffer byteBuffer = ByteBuffer.allocate(4096);
            // Bytes of the current line whose terminating '\n' has not been seen yet.
            // Kept as raw bytes so a multi-byte character split across two chunks decodes correctly.
            byte[] pending = new byte[0];
            while (fileChannel.read(byteBuffer) != -1) {
                byteBuffer.flip();
                byte[] chunk = new byte[byteBuffer.remaining()];
                byteBuffer.get(chunk);
                byteBuffer.clear();

                // Emit one line per '\n' found in the chunk (a chunk may hold several lines).
                int lineStart = 0;
                for (int i = 0; i < chunk.length; i++) {
                    if (chunk[i] == '\n') {
                        putCsvLine(joinBytes(pending, chunk, lineStart, i), channelCode2NameMap);
                        pending = new byte[0];
                        lineStart = i + 1;
                    }
                }
                // Whatever follows the last newline is the start of the next line.
                pending = joinBytes(pending, chunk, lineStart, chunk.length);
            }
            // The last line may not be terminated by a newline.
            if (pending.length > 0) {
                putCsvLine(pending, channelCode2NameMap);
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }

        return channelCode2NameMap;
    }

    /** Returns {@code head} followed by {@code tail[from, to)} as a new array. */
    private static byte[] joinBytes(byte[] head, byte[] tail, int from, int to) {
        byte[] joined = new byte[head.length + (to - from)];
        System.arraycopy(head, 0, joined, 0, head.length);
        System.arraycopy(tail, from, joined, head.length, to - from);
        return joined;
    }

    /** Parses one raw line as {@code code,name} and stores it; lines with fewer than two fields are ignored. */
    private static void putCsvLine(byte[] lineBytes, Map<String, String> target) {
        String line = new String(lineBytes).replaceAll("\\s", "");
        String[] fields = line.split(",");
        if (fields.length < 2) {
            return;
        }
        target.put(fields[0], fields[1]);
    }

参考文档

感谢作者黄亿华的开源和分享,这里是Webmagic相关地址:

文档:
● 中文: http://webmagic.io/docs/zh/
● English: http://webmagic.io/docs/en

API:
● http://code4craft.github.io/webmagic/docs/en/

源码:
● https://git.oschina.net/flashsword20/webmagic
● https://github.com/code4craft/webmagic

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

猿与禅

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值