我对当当网所有分类进行了遍历,对分类下的商品内容进行爬取,算是一个简单的爬取,并没有细化分类、爬取所有的商品。
下面是爬虫的spider
import scrapy
from pyquery import PyQuery as pq
from dangdang.items import DangdangItem
class SpiderSpider(scrapy.Spider):
    """Crawl dangdang.com category pages and yield product items.

    Starts from the category index page, follows every category link in
    the left-hand classify menu, then walks each category's listing pages
    via the "next page" arrow, yielding one DangdangItem per product.
    """
    name = "spider"
    # Fixed typo from the original (`allowed_domians`), which Scrapy silently
    # ignored. Use the bare registrable domain so category.dangdang.com
    # requests are not dropped by the offsite middleware.
    allowed_domains = ['dangdang.com']

    def start_requests(self):
        """Entry point: request the category index page."""
        start_url = 'http://category.dangdang.com/?ref=www-0-C'
        yield scrapy.Request(url=start_url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Extract every category link from the left-hand classify menu."""
        menu_list = response.xpath('//*[contains(@class,"classify_left")]//a')
        for menu in menu_list:
            url = menu.xpath('@href').extract_first()
            # Guard against anchors with no href (extract_first() -> None)
            # and javascript pseudo-links.
            if url and 'javascript' not in url:
                yield scrapy.Request(url=url, callback=self.parse_cid, dont_filter=True)

    def parse_cid(self, response):
        """Extract name/price/level/shop for each product on a category
        listing page, then follow pagination until it ends."""
        note_list = response.xpath('//div[contains(@id,"search_nature_rg")]/ul/li')
        for note in note_list:
            # One fresh item per product. The original created a single item
            # before the loop and kept mutating it, so every yielded item was
            # the same object.
            item = DangdangItem()
            values = {
                'name': note.xpath('./p[contains(@class,"name")]/a/text()').extract_first(),
                'price': note.xpath('./p[@class="price"]/span/text()').extract_first(),
                'level': note.xpath('./p[contains(@class,"star")]/a/text()').extract_first(),
                # Listings without a shop link are sold by Dangdang itself.
                'shop': note.xpath('./p[contains(@class,"link")]/a/text()').extract_first() or u'当当自营',
            }
            # Explicit assignment replaces the original `eval(field)` trick,
            # which was unsafe and broke whenever fields and locals drifted.
            for field, value in values.items():
                if field in item.fields:
                    item[field] = value
            yield item
        # extract_first() is None on the last page; the original concatenated
        # it blindly and hid the resulting TypeError behind a bare `except`.
        next_href = response.xpath('//a[contains(@class,"arrow_r")]/@href').extract_first()
        if next_href and 'javascript:void(0);' not in next_href:
            yield scrapy.Request(url='http://category.dangdang.com/' + next_href,
                                 callback=self.parse_cid, dont_filter=True)
下面是爬虫的pipeline
import MySQLdb
from dangdang.settings import *
class DangdangPipeline(object):
    """Persist DangdangItem rows into MySQL.

    Connection parameters (MYSQL_HOST, MYSQL_USER, MYSQL_PASSWD,
    MYSQL_DBNAME) and `table_name` come from dangdang.settings, imported
    with `*` at module level.
    """

    def __init__(self):
        self.db = MySQLdb.connect(MYSQL_HOST, MYSQL_USER, MYSQL_PASSWD,
                                  MYSQL_DBNAME, charset='utf8mb4', use_unicode=True)
        self.cursor = self.db.cursor()
        # `level` is a MySQL keyword, so it is backtick-quoted defensively.
        self.insert_sql = """
            insert into {table_name}(name,price,`level`,shop)
            VALUES (%s, %s, %s, %s)
        """.format(table_name=table_name)

    def process_item(self, item, spider):
        """Insert one row per item; roll back on database errors so a bad
        row does not poison the connection for later items."""
        params = (item['name'], item['price'], item['level'], item['shop'])
        try:
            # The original buffered a one-element list, ran executemany() and
            # cleared it every call — a plain execute() is equivalent.
            self.cursor.execute(self.insert_sql, params)
            self.db.commit()
        except MySQLdb.MySQLError:
            self.db.rollback()
            spider.logger.exception('failed to insert item: %r', params)
        return item

    def close_spider(self, spider):
        """Release the cursor and connection when the spider finishes —
        the original never closed either."""
        self.cursor.close()
        self.db.close()
最后放图
单机没有跑完,大概跑了10万条数据。