1.cmd命令行下创建crawl自动爬虫:
scrapy startproject 项目名
scrapy genspider -t crawl 爬虫名 网站域名
2.代码
2.1 在items.py中初始化相关容器:
Id=scrapy.Field()
title=scrapy.Field()
price=scrapy.Field()
link=scrapy.Field()
detail=scrapy.Field()
# 评价数量
comment=scrapy.Field()
2.2 爬虫代码
# -*- coding: utf-8 -*-
import re
import urllib
import urllib.request

import scrapy
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from learning.JD.JD.items import JdItem
classJdspiderSpider(CrawlSpider):
name = 'jdSpider'allowed_domains = ['jd.com']
# start_urls = ['https://www.jd.com/']defstart_requests(self):
hearder=hearder={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"}
yieldRequest('https://www.jd.com/',headers=hearder,)
rules = (
#教案给出来的是不设置过滤规则,将所有的网页都爬取下来后再利用正则表达式判断是不是商品展示页面Rule(LinkExtractor(allow=''),callback='parse_item',follow=True),#自己试验发现所有商品页url中都有item关键字# Rule(LinkExtractor(allow='item'), callback='parse_item', follow=True),)
defparse_item(self,response):
try:
item = JdItem()
#获取当前正在爬取的页面thisUrl=response.url
print("######"+thisUrl)
pat='item.jd.com/(.*?).html'#判断当前正在爬取的页面是不是商品页面,及商品页面url的格式是否符合y=re.search(pat,thisUrl)
if(y):
thisId=re.compile(pat).findall(thisUrl)[0]
#这些是直接能从网页当中得到的信息item["Id"]=thisId
print("正在爬取的商品id为:"+thisId)
item["title"]= response.xpath("//html/head/title/text()").extract()
print("当前商品标题为:"+item["title"][0])
item["link"]=response.xpath('//html/head/link[@rel="canonical"]/@href').extract()
print("当前商品连接为:"+item["link"][0])
item["detail"] = response.xpath('//div[@class="p-parameter"]//li//text()').extract()
print("当前商品细节为:"+item["detail"][0])
#这些是必须要通过抓包而分析才能获取到的信息#根据商品id拼接得到将要爬取的urlpriceUrl="https://c0.3.cn/stock?skuId="+thisId+"&area=22_1930_49322_0&venderId=1000004259&buyNum=1&choseSuitSkuIds=&cat=9987,653,655&extraParam={%22originid%22:%221%22}&fqsp=0&pdpin=jd_6401fe2b172db&pduid=15661949259061415596451&ch=1&callback=jQuery1517076"commentUrl="https://club.jd.com/comment/productCommentSummaries.action?referenceIds="+thisId+"&callback=jQuery4229677&_=1567588862109"#根据拼接的url得到网页中的数据(json数据),再通过正则表达式进行提取priceData=urllib.request.urlopen(priceUrl).read().decode("utf-8","ignore")
commentData=urllib.request.urlopen(commentUrl).read().decode("utf-8","ignore")
# print("commentData为:"+commentData)patP='"p":"(.*?)"'patC='"CommentCount":(.*?),'item["price"]= re.compile(patP).findall(priceData)
print("当前商品价格为:"+item["price"][0])
item["comment"]= re.compile(patC).findall(commentData)
print("当前商品评论为:"+item["comment"][0])
if(len(item["Id"]) andlen(item["title"])andlen(item["link"]) andlen(item["detail"]) andlen(item["price"]) andlen(item["comment"])==0):
print("有商品页的所有信息都无法捕获")
if(len(item["Id"]) orlen(item["title"]) orlen(item["link"]) orlen(item["detail"]) orlen(item["price"]) orlen(item["comment"])==0):
print("该商品的部分信息没有爬取到")
else:
passexcept:
print("爬虫出错")
returnitem
2.3数据库操作,在pipelines.py中完成
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
classJdPipeline(object):
defprocess_item(self,item,spider):
con=pymysql.connect(host="127.0.0.1",user="root",passwd="root",db="python")
fori inrange(0,len(item["Id"])):
# print("$$@#$@$@#@$"+str(item["detail"]))#detail是以列表的形式存储的,这种方式智能插入列表第一个元素,不能插入所有元素# sql="insert into jd(goodsId,title,link,detial,price,commentCount) values ('%s','%s','%s','%s','%s','%s')" %(# item["Id"],item["title"],item["link"],detail,item["price"],item["comment"]# )#将detail中的数据转化为字符串的形式进行存储,再写入数据库中,并且不加下标,可以存入所有的元素detail=" ".join(item["detail"])
sql="insert into jd(goodsId,title,link,detial,price,commentCount) values ('%s','%s','%s','%s','%s','%s')"%(
item["Id"],item["title"],item["link"],detail,item["price"],item["comment"])
print("sql为:"+sql)
con.query(sql)
con.close()
returnitem