WebMagic 架构图:
Processor:
/**
 * WebMagic page processor that extracts one channel's programme listing from a tvsou.com EPG page.
 *
 * <p>For every {@code <li>} of the schedule list it emits one line of the form
 * {@code channel \t date \t time \t programme \t \n}. The concatenation of all lines is stored in
 * the page's result fields under the empty key {@code ""}.
 */
public class TvSouProcessor implements PageProcessor {
    /** XPath of the text node holding the schedule date. */
    private static final String DATE_XPATH =
            "/html/body/div[3]/div[3]/div[3]/div[2]/div[1]/div[2]/span/text()";
    /** XPath matching every entry ({@code <li>}) of the schedule list. */
    private static final String ITEM_XPATH = "/html/body/div[3]/div[3]/div[3]/div[2]/div[3]/ol/li";
    /** URL path segment that immediately precedes the channel code. */
    private static final String EPG_MARKER = "epg/";

    /** Channel code (as it appears in the URL) mapped to the channel name, loaded from the CSV mapping file. */
    Map<String, String> channelCodeMap = NIOUtils.csvFile2Map(TvSouConstant.CHANNELMAP_FILEPATH);

    // The charset is not set explicitly (a .setCharset("UTF-8") call was left disabled by the author).
    private Site site = Site.me().setCycleRetryTimes(5).setRetryTimes(5).setSleepTime(500).setTimeOut(3 * 60 * 1000)
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0")
            .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
            .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");

    /**
     * Extracts the schedule of the page and stores it as one tab-separated text block.
     *
     * <p>Pages whose URL does not contain {@code epg/<channel-code>/...} are ignored: no field is
     * stored for them.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String url = page.getUrl().toString();

        // The channel code is the path segment between "epg/" and the last '/' of the URL.
        int markerPos = url.indexOf(EPG_MARKER);
        int codeStart = markerPos + EPG_MARKER.length();
        int codeEnd = url.lastIndexOf("/");
        if (markerPos < 0 || codeEnd < codeStart) {
            // Not an EPG URL of the expected shape; slicing it would throw or yield garbage.
            return;
        }
        String channelCode = url.substring(codeStart, codeEnd);

        // Translate the URL code to the channel name; fall back to the raw code when the mapping
        // file has no entry, instead of failing with a NullPointerException.
        String mappedChannel = channelCodeMap.get(channelCode);
        String channel = mappedChannel != null ? mappedChannel : channelCode;

        // Schedule date as shown on the page.
        String date = page.getHtml().xpath(DATE_XPATH).toString();

        // Build one output line per schedule entry. XPath positions are 1-based.
        StringBuilder accum = new StringBuilder();
        int size = page.getHtml().xpath(ITEM_XPATH).nodes().size();
        for (int i = 1; i <= size; i++) {
            // Start time of the programme.
            String time = page.getHtml().xpath(ITEM_XPATH + "[" + i + "]/span/text()").toString();
            // Programme title.
            String pg = page.getHtml().xpath(ITEM_XPATH + "[" + i + "]/a/text()").toString();
            accum.append(channel).append("\t");
            accum.append(date).append("\t");
            accum.append(time).append("\t");
            accum.append(pg).append("\t");
            accum.append("\n");
        }
        page.putField("", accum.toString());

        // NOTE: deep crawling (adding the sibling channel links found on this page as target
        // requests) was tried by the original author, but only one URL matched the pattern
        // "http://tvsou.com/epg/cctv-[\\d]/<yyyyMMdd>\\?class=yangshi", so it is not used here.
    }

    /**
     * Returns the crawl settings (retries, sleep time, timeout, headers) used for this processor.
     */
    @Override
    public Site getSite() {
        return site;
    }
}
执行入口
TvSouExecutor:
/**
 * Entry point of the crawler: builds yesterday's EPG URL for every channel listed in the
 * channel-code mapping file and runs a single-threaded WebMagic spider over them, writing all
 * results into one file.
 */
public class TvSouExecutor {
    /**
     * Runs the crawl.
     *
     * @param args ignored
     * @throws FileNotFoundException declared by the {@code OneFilePipeline} constructor
     * @throws UnsupportedEncodingException declared by the {@code OneFilePipeline} constructor
     */
    public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
        // Load the mapping file; its keys are the channel codes used in the URLs.
        Map<String, String> channelCodeMap = NIOUtils.csvFile2Map(TvSouConstant.CHANNELMAP_FILEPATH);

        // Yesterday's date. The pattern must use 'y' (calendar year): 'Y' is the week-based year
        // and yields the wrong year for some days around New Year.
        Calendar cal = Calendar.getInstance();
        cal.add(Calendar.DATE, -1);
        String yesterdayStr = new SimpleDateFormat("yyyyMMdd").format(cal.getTime());
        String yesterdayStr2 = new SimpleDateFormat("yyyy-MM-dd").format(cal.getTime());

        List<String> urlList = new ArrayList<String>();
        for (String channelCode : channelCodeMap.keySet()) {
            // Codes containing "cctv" get the "yangshi" class parameter (CCTV channels);
            // all others get "weishi" (provincial satellite channels).
            String category = channelCode.contains("cctv") ? "yangshi" : "weishi";
            urlList.add("http://tvsou.com/epg/" + channelCode + "/" + yesterdayStr + "?class=" + category);
        }
        String[] urls = urlList.toArray(new String[urlList.size()]);

        // A FileCacheQueueScheduler("data/cache") could be set on the spider to record every
        // crawled URL; it is left disabled, as in the original.
        Spider.create(new TvSouProcessor())
                .addUrl(urls)
                .addPipeline(new OneFilePipeline("data/epg" + yesterdayStr2))
                .thread(1)
                .run();
    }
}
此处使用了黄大大(WebMagic 作者黄亿华)的示例代码
OneFilePipeline:
/**
 * Pipeline that appends the values of every result field of every page to one single file,
 * encoded as UTF-8. Field keys are not written; values are written back to back without any
 * separator added by this class.
 *
 * <p>The pipeline owns the underlying file handle and releases it in {@link #close()}.
 * NOTE(review): WebMagic's Spider is understood to close components that implement
 * {@code java.io.Closeable} when it shuts down - confirm for the WebMagic version in use.
 */
public class OneFilePipeline extends FilePersistentBase implements Pipeline, java.io.Closeable {
    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final PrintWriter printWriter;

    /**
     * Creates a pipeline writing to the default location {@code /data/webmagic/}.
     *
     * @throws FileNotFoundException if the output file cannot be opened for writing
     * @throws UnsupportedEncodingException if UTF-8 is not supported by the runtime
     */
    public OneFilePipeline() throws FileNotFoundException, UnsupportedEncodingException {
        this("/data/webmagic/");
    }

    /**
     * Creates a pipeline writing to the given file.
     *
     * @param path path of the output file
     * @throws FileNotFoundException if the output file cannot be opened for writing
     * @throws UnsupportedEncodingException if UTF-8 is not supported by the runtime
     */
    public OneFilePipeline(String path) throws FileNotFoundException, UnsupportedEncodingException {
        setPath(path);
        printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path)), "UTF-8"));
    }

    /**
     * Writes all field values of {@code resultItems} to the file and flushes.
     *
     * @param resultItems extracted fields of one page
     * @param task the task the page belongs to (unused)
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            Object fieldValue = entry.getValue();
            if (fieldValue instanceof Iterable) {
                // Multi-valued field: write every element in iteration order.
                for (Object element : (Iterable<?>) fieldValue) {
                    printWriter.print(element);
                }
            } else {
                printWriter.print(fieldValue);
            }
        }
        // PrintWriter never throws on I/O failure; checkError() flushes the stream and reports
        // whether any write has failed, so failures are logged instead of being lost.
        if (printWriter.checkError()) {
            logger.warn("write file error");
        }
    }

    /**
     * Flushes and closes the output file. Writing after this call has no effect on the file.
     */
    @Override
    public synchronized void close() {
        printWriter.close();
    }
}
工具类
/**
 * Reads a comma-separated file into a map, using the first field of each line as key and the
 * second field as value.
 *
 * <p>All whitespace is removed from a line before it is split on {@code ','}. Lines with fewer
 * than two fields (for example blank lines) are skipped. When a key occurs more than once the
 * last occurrence wins. The file is decoded as UTF-8.
 *
 * <p>I/O problems (including a missing file) are not propagated: the stack trace is printed and
 * the entries read so far are returned.
 *
 * @param filePath path of the CSV file
 * @return a mutable map from first column to second column; empty if nothing could be read
 */
public static Map<String, String> csvFile2Map(String filePath) {
    Map<String, String> channelCode2NameMap = new HashMap<String, String>();
    // try-with-resources guarantees the file handle is released on every path.
    try (java.io.BufferedReader reader = new java.io.BufferedReader(new java.io.InputStreamReader(
            new java.io.FileInputStream(filePath), java.nio.charset.StandardCharsets.UTF_8))) {
        String rawLine;
        // readLine() handles LF, CR and CRLF terminators and also returns a final line that has
        // no trailing terminator.
        while ((rawLine = reader.readLine()) != null) {
            String line = rawLine.replaceAll("\\s", "");
            String[] lineArr = line.split(",");
            if (lineArr.length < 2) {
                // Blank or malformed line: there is no code/name pair to store.
                continue;
            }
            channelCode2NameMap.put(lineArr[0], lineArr[1]);
        }
    } catch (IOException e) {
        // Best effort, as before: report the problem and return what has been read.
        e.printStackTrace();
    }
    return channelCode2NameMap;
}
参考文档
感谢作者黄亿华的开源和分享,这里是Webmagic相关地址:
文档:
● 中文: http://webmagic.io/docs/zh/
● English: http://webmagic.io/docs/en
API:
● http://code4craft.github.io/webmagic/docs/en/
源码:
● https://git.oschina.net/flashsword20/webmagic
● https://github.com/code4craft/webmagic