python爬取京东商品信息验证中心,[JavaScript]爬取京东商城上的商品信息和评论爬虫源码 - 代码贴 - BCCN...

最新推荐文章于 2024-03-03 22:30:44 发布

无用的李

最新推荐文章于 2024-03-03 22:30:44 发布

阅读量228

点赞数

文章标签： python爬取京东商品信息验证中心

/*使用javascript编写的爬虫源码，用于爬取京东商城上的商品信息和评论。

代码粘贴到神箭手云爬虫平台(http://www.shenjianshou.cn)上就可以直接跑了，不需要安装编译环境。要爬取其他网站，更改源码即可。

代码执行具体步骤请参考：https://github.com/ShenJianShou/crawler_samples/blob/master/%E5%A6%82%E4%BD%95%E6%89%A7%E8%A1%8C%E6%A0%B7%E4%BE%8B%E4%BB%A3%E7%A0%81.txt

*/

var keyword = "d3.js";//@input(keyword, 查询关键字, 爬取该关键字搜索出来的京东商品)

var comment_count = 100;//@input(comment_count, 爬取的评论数, 最多爬取多少条评论)

var page_count = comment_count / 10;

keyword = keyword.trim();

var scanUrls = [];

scanUrls.push("http://search.Search?keyword=" + keyword.replace(/ /g, "+") + "&enc=utf-8&scrolling=y&page=200");

var helperUrlRegexes = [];

helperUrlRegexes.push("http://search\\.jd\\.com/Search\\?keyword=" + keyword.replace(/ /g, "\\+").replace(/\./g, "\\.") + "&enc=utf-8&scrolling=y&page=\\d+");

// Crawler configuration object: which hosts and URLs to visit and which
// fields to extract from each product page. The exact meaning of each option
// is defined by the hosting crawler platform (not visible in this file).
var configs = {
    // Hosts the crawl touches: search results, product pages, comment pages.
    // Restored from the hostnames used by the URL patterns in this file.
    domains: ["search.jd.com", "item.jd.com", "club.jd.com"],

    scanUrls: scanUrls,

    // Product detail pages; the dot before "html" is escaped so that it only
    // matches a literal ".".
    contentUrlRegexes: ["http://item\\.jd\\.com/\\d+\\.html"],

    helperUrlRegexes: helperUrlRegexes,

    interval: 10000,

    fields: [
        {
            // First extraction item: the product title.
            name: "title",
            selector: "//div[@id='name']/h1",
            required: true
        },
        {
            // Second extraction item: the product id.
            name: "productid",
            selector: "//div[contains(@class,'fl')]/span[2]",
            required: true
        },
        {
            // One entry per <span> under #comment-pages; each span's text is
            // used as a comment page index (see the "page" child below).
            name: "comments",
            selector: "//div[@id='comment-pages']/span",
            repeated: true,
            children: [
                {
                    name: "page",
                    selector: "//text()"
                },
                {
                    // Comments for that page, read from a separate URL built
                    // from the product id and the page index.
                    // NOTE(review): the host/path segment after "club." was
                    // lost in extraction; "jd.com/productpage/p-" is the
                    // presumed original — confirm against the live endpoint.
                    name: "comments",
                    sourceType: SourceType.AttachedUrl,
                    attachedUrl: "http://club.jd.com/productpage/p-{$.productid}-s-0-t-3-p-{page}.html",
                    selectorType: SelectorType.JsonPath,
                    selector: "$.comments",
                    repeated: true,
                    children: [
                        {
                            name: "com_content",
                            selectorType: SelectorType.JsonPath,
                            selector: "$.content"
                        },
                        {
                            name: "com_nickname",
                            selectorType: SelectorType.JsonPath,
                            selector: "$.nickname"
                        }
                    ]
                }
            ]
        }
    ]
};

configs.afterDownloadPage = function(page, site) {

var matches = /item\.jd\.com\/(\d+)\.html/.exec(page.url);

if (!matches) return page;

var commentUrl = "http://club.+matches[1]+"-s-0-t-3-p-0.html";

var result = site.requestUrl(commentUrl);

var data = JSON.parse(result);

var commentCount = data.productCommentSummary.commentCount;

var pages = commentCount / 10;

if (pages > page_count) pages = page_count;

var pageHtml = "

";

for (var i = 0; i < pages; i++) {

pageHtml += "" + i + "";

}

pageHtml += "

";

var index = page.raw.indexOf("

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python爬取京东商品信息验证中心,[JavaScript]爬取京东商城上的商品信息和评论爬虫源码 - 代码贴 - BCCN...

/*使用javascript编写的爬虫源码，用于爬取京东商城上的商品信息和评论。代码粘贴到神箭手云爬虫平台(http://www.)上就可以直接跑了，不需要安装编译环境。要爬取其他网站，可以更改源码即可。代码执行具体步骤请参考：https://ShenJianShou/crawler_samples/blob/master/%E5%A6%82%E4%BD%95%E6%89%A7%E8%A1%8C%...
复制链接

扫一扫

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。