完整项目见https://github.com/Narutoooooooo/Spider
一、包含item pipelines的较完整爬虫步骤
-
新建爬虫项目(如何创建爬虫项目)
-
在mySpider/spiders/下新建test2.py,代码内容如下
import json
import re

import scrapy

from mySpider.items import CommentItem


class Spider(scrapy.Spider):
    """Crawl the JD.com product-comment JSONP endpoint and yield a CommentItem per comment."""

    name = "test2"
    # Browser-like headers so the endpoint serves the JSONP payload.
    # NOTE(review): the original text had spaces inside the header names
    # (e.g. "Accept - Encoding") and values ("* / *"), which are not valid
    # HTTP headers; they are normalized here.
    heads = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Host": "club.jd.com",
        "Referer": "https://item.jd.com/100011336064.html",
        "Sec-Fetch-Dest": "script",
        "Sec-Fetch-Mode": "no-cors",
        "Sec-Fetch-Site": "same-site",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36",
    }
    url = ("https://club.jd.com/comment/productPageComments.action"
           "?callback=fetchJSON_comment98&productId=100011336064&score=0"
           "&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1")

    def start_requests(self):
        """Seed the crawl with the single comment-page request."""
        yield scrapy.Request(url=self.url, headers=self.heads, callback=self.parse)

    def parse(self, response):
        """Strip the JSONP wrapper, decode the JSON body and yield items.

        The response looks like ``fetchJSON_comment98({...});`` — the regex
        grabs everything between the outermost parentheses (greedy match;
        ``re.S`` lets ``.`` span newlines).
        """
        wrapper = re.compile(r'[(](.*)[)]', re.S)
        matches = re.findall(wrapper, response.text)
        payload = json.loads(matches[0])
        for comment in payload["comments"]:
            item = CommentItem()
            item['id'] = comment["id"]
            item['nickname'] = comment["nickname"]
            item['productColor'] = comment["productColor"]
            item['productSize'] = comment["productSize"]
            # Hand the populated item to the configured item pipelines.
            yield item
-
修改items.py
import scrapy


class CommentItem(scrapy.Item):
    """Container for one JD.com product comment scraped by the ``test2`` spider."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    id = scrapy.Field()            # comment id
    nickname = scrapy.Field()      # reviewer's display name
    productColor = scrapy.Field()  # colour variant of the purchased product
    productSize = scrapy.Field()   # size / configuration variant
-
修改pipelines.py
class CommentPipeline(object):
    """Minimal item pipeline: prints each comment and passes it downstream."""

    def __init__(self):
        # Typically used to open a database connection; nothing to set up here.
        pass

    def process_item(self, item, spider):
        """Print a short summary of *item* and return it unchanged.

        Scrapy requires ``process_item`` to return the item (or raise
        ``DropItem``) so that later pipelines in the chain still receive it;
        the original implicitly returned ``None``, which would feed ``None``
        to any subsequent pipeline.
        """
        print(item['id'], item['nickname'])
        return item
-
修改mySpider/__init__.py
from scrapy import cmdline

# Start the spider registered under the name "test2",
# exactly as if `scrapy crawl test2` had been typed on the command line.
command = "scrapy crawl test2"
cmdline.execute(command.split())
-
运行结果
13918726044 琅琊焱 13831071018 M***空 14017180543 羊男神 13821262578 tgy19972015 13940728851 xb_sunshine 13831450974 智***林 13820001940 x***b 13800956917 jd_182*****167 13994655256 大***涂 13937885762 jd136884sqc
二、爬虫处理html文件
部分代码说明
-
test3.py
import scrapy


class Spider(scrapy.Spider):
    """Fetch a JD.com product page and print the product title extracted via XPath."""

    name = "test3"
    heads = {
    }
    url = "https://item.jd.com/100011351652.html"

    def start_requests(self):
        # Seed the crawl with the single product-page request.
        yield scrapy.Request(url=self.url, headers=self.heads, callback=self.parse)

    def parse(self, response):
        # XPath notes:
        #   /   addresses from the document root; // searches globally
        #   [@class = 'sku-name'] filters on the tag's class attribute
        #   text() selects the tag's text content
        #   xpath() returns a selector list; extract() yields a unicode string
        name = response.xpath("//div[@class = 'sku-name']/text()")[0].extract()
        print(name)
输出结果:
小米10 Pro 双模5G 骁龙865 1亿像素8K电影相机 50倍变焦 50W快充 12GB+512GB 珍珠白 拍照智能游戏手机
-
test4.py
import scrapy
from bs4 import BeautifulSoup
from lxml import etree
import js2xml


class Spider(scrapy.Spider):
    """Pull SKU ids and colour/size variants out of the inline <script> on a JD.com product page."""

    name = "test4"
    heads = {
    }
    url = "https://item.jd.com/100011351652.html"

    def start_requests(self):
        yield scrapy.Request(url=self.url, headers=self.heads, callback=self.parse)

    def parse(self, response):
        # Parse the HTML and grab the first <script> element under <head>.
        soup = BeautifulSoup(response.text, 'lxml')
        script_source = soup.select("html head script")[0].string
        # Turn the JavaScript source into an XML document with js2xml ...
        js_ast = js2xml.parse(script_source, debug=False)
        pretty_xml = js2xml.pretty_print(js_ast)
        # ... then re-parse that XML so lxml's HTML XPath engine can query it.
        selector = etree.HTML(pretty_xml)
        # @value reads a tag attribute; ./ addresses from the current node.
        value = selector.xpath("//property[@name = 'skuId']/number/@value")
        print(value)
        for node in selector.xpath("//property[@name = 'colorSize']/array/object"):
            sku_id = node.xpath("./property/number/@value")[0]
            variant = ",".join(node.xpath("./property/string/text()"))
            print(sku_id)
            print(variant)
输出结果:
['100011351676', '100011395166', '100011351652', '100011395162', '100011351674', '100006299205', '100011351650', '100011395138', '100011351678', '100006299195']
100011351676
官方标配,8GB+256GB,珍珠白
100011395166
套装版,8GB+256GB,珍珠白
100011351652
官方标配,12GB+512GB,珍珠白
100011395162
套装版,12GB+512GB,珍珠白
100011351674
官方标配,8GB+256GB,星空蓝
100006299205
套装版,8GB+256GB,星空蓝
100011351650
官方标配,12GB+256GB,星空蓝
100011395138
套装版,12GB+256GB,星空蓝
100011351678
官方标配,12GB+512GB,星空蓝
100006299195
套装版,12GB+512GB,星空蓝
注:import js2xml时需要提前下载这个包
命令:pip install js2xml