A record of a successful crawler run
Create the crawler project and the spider file
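The post does not list the exact commands, but with Scrapy's command-line tool the usual sequence (project name dangdang and spider name dd are taken from the code below) is: run scrapy startproject dangdang, change into the dangdang directory, then run scrapy genspider dd dangdang.com to generate the spider file under the spiders/ folder.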
Code of the spider file
import scrapy
from dangdang.items import DangdangItem
from scrapy.http import Request


class DdSpider(scrapy.Spider):
    name = 'dd'
    allowed_domains = ['dangdang.com']
    # Search results for the URL-encoded keyword, starting at page 1
    start_urls = ['http://search.dangdang.com/?key=%C1%AC%D2%C2%C8%B9&act=input&page_index=1']

    def parse(self, response):
        item = DangdangItem()
        item["title"] = response.xpath('//p[@class="name"]/a/@title').extract()
        item["link"] = response.xpath('//p[@class="name"]/a/@href').extract()
        item["comment"] = response.xpath('//p[@class="star"]/a/text()').extract()
        yield item
        # Follow further result pages by rebuilding the URL with a new page_index
        for i in range(90, 92):
            url = 'http://search.dangdang.com/?key=%C1%AC%D2%C2%C8%B9&act=input&page_index=' + str(i)
            yield Request(url, callback=self.parse)
items.py
import scrapy


class DangdangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()    # product title
    link = scrapy.Field()     # product link
    comment = scrapy.Field()  # number of comments
pipelines.py
import pymysql


class DangdangPipeline(object):
    def process_item(self, item, spider):
        conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root", db="dd", charset="utf8mb4")
        try:
            with conn.cursor() as cursor:
                for i in range(0, len(item["title"])):
                    title = item["title"][i]
                    link = item["link"][i]
                    comment = item["comment"][i]
                    # Parameterized SQL avoids quoting problems and SQL injection
                    sql = "insert into goods(title,link,comment) values(%s,%s,%s)"
                    print(sql)
                    try:
                        cursor.execute(sql, (title, link, comment))  # execute the SQL statement
                    except Exception as err:
                        print(err)
            conn.commit()  # pymysql does not autocommit, so persist the inserts explicitly
        finally:
            conn.close()
        return item
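The pipeline assumes that MySQL already has a dd database containing a goods table. The post does not give the schema; the sketch below is an assumed minimal schema (an auto-increment id plus plain varchar columns matching the three fields inserted above), created with pymysql:

import pymysql

# Assumed schema: an auto-increment id plus the three text fields used by the pipeline
conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root", charset="utf8mb4")
with conn.cursor() as cursor:
    cursor.execute("create database if not exists dd character set utf8mb4")
    cursor.execute(
        "create table if not exists dd.goods("
        "id int primary key auto_increment,"
        "title varchar(255),"
        "link varchar(255),"
        "comment varchar(50))"
    )
conn.commit()
conn.close()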
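One more step the post omits: DangdangPipeline only runs if it is registered in the project's settings.py. A minimal sketch, assuming the default project layout created above:

# settings.py
ITEM_PIPELINES = {
    'dangdang.pipelines.DangdangPipeline': 300,
}
# Generated Scrapy projects obey robots.txt by default; disable it if requests get filtered
ROBOTSTXT_OBEY = False

With the pipeline enabled and the table in place, the spider is started from the project directory with scrapy crawl dd.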