写了个爬取京东商品评论的简单爬虫脚本:
// Crawler configuration for the ShenJianShou cloud-crawler platform:
// starts from a JD book search for "Python", follows product detail
// pages, and pulls each product's comments from the club.jd.com JSON API.
var configs = {
    // Hosts the crawler is allowed to visit.
    domains: ["search.jd.com","item.jd.com","club.jd.com"],
    // Entry point: page 1 of the JD search results for "Python" books.
    scanUrls: ["http://search.jd.com/Search?keyword=Python&enc=utf-8&qrst=1&rt=1&stop=1&book=y&vt=2&page=1&s=1&click=0"],
    // Product detail pages — the pages that `fields` below are extracted from.
    contentUrlRegexes: ["http://item\\.jd\\.com/\\d+.html"],
    // Search-result listing pages — followed only to discover product links.
    helperUrlRegexes: ["http://search\\.jd\\.com/Search\\?keyword=Python&enc=utf-8&qrst=1&rt=1&stop=1&book=y&vt=2&page=\\d+&s=1&click=0"],
    fields: [
        {
            // Product title from the detail page's name header.
            name: "title",
            selector: "//div[@id='name']/h1",
            required: true
        },
        {
            // Numeric product id, substituted into attachedUrl below.
            name: "productid",
            selector: "//div[contains(@class,'fl')]/span[2]",
            required: true
        },
        {
            name: "comments",
            // Fetch an additional URL (the comments JSON API) per product.
            sourceType: SourceType.AttachedUrl,
            // NOTE(review): the trailing "-p-0" hard-codes comment page 0,
            // so only the first ~30 comments are ever fetched; crawling
            // further comment pages would require varying this p- value.
            attachedUrl: "http://club.jd.com/productpage/p-{productid}-s-0-t-3-p-0.html",
            selectorType: SelectorType.JsonPath,
            selector: "$.comments",
            // One extracted item per element of the $.comments array.
            repeated: true,
            children:[
                {
                    // Comment body text.
                    name: "com_content",
                    selectorType: SelectorType.JsonPath,
                    selector: "$.content"
                },
                {
                    // Commenter's display name.
                    name: "com_nickname",
                    selectorType: SelectorType.JsonPath,
                    selector: "$.nickname"
                },
            ]
        }
    ]
};
// Helper-page hook: after each search-result (helper) page is fetched,
// queue the next results page unless JD reported "no results found".
//
// @param {String} url     - URL of the helper page just fetched
// @param {String} content - raw HTML of that page
// @param {Object} site    - platform handle; site.addUrl(u) queues a URL
// @returns {Boolean} true so the platform keeps processing this page
configs.onProcessHelperUrl = function(url, content, site){
    // Bug fix: String#indexOf returns -1 when the substring is absent,
    // and -1 is truthy, so the original `!content.indexOf(...)` fired
    // only when the message sat at index 0. Check for -1 explicitly:
    // keep paginating while the "sorry, nothing found" message is absent.
    if(content.indexOf("抱歉,没有找到") === -1){
        // Always pass a radix to parseInt; trailing "&s=1..." is ignored
        // because parseInt stops at the first non-digit character.
        var currentPage = parseInt(url.substring(url.indexOf("&page=") + 6), 10);
        // Defensive fallback if the page number is missing or unparseable.
        if(isNaN(currentPage) || currentPage === 0){
            currentPage = 1;
        }
        // NOTE(review): the original advanced by 2; JD search appears to
        // number visible result pages 1, 3, 5, ... — preserved as-is.
        var page = currentPage + 2;
        var nextUrl = url.replace("&page=" + currentPage, "&page=" + page);
        site.addUrl(nextUrl);
    }
    return true;
};
// Hand the assembled configuration to the platform's Crawler and start it.
// NOTE(review): Crawler is a global provided by the ShenJianShou runtime.
var crawler = new Crawler(configs);
crawler.start();
PS:爬虫脚本是用JS写的
PPS:是在神箭手云爬虫平台上运行的
请教:目前每个商品只能爬到第一页的 30 条评论,请问怎样才能爬到第二页及之后的评论?不知道如何处理,有大神知道的嘛?