# taobao.py: main spider program
# -*- coding: utf-8 -*-
import scrapy
import json
import re
from ..items import TaobaoItem
class TaobaoSpider(scrapy.Spider):
name = 'taobao'
allowed_domains = ['taobao.com']
start_urls = ['https://s.taobao.com']
# Example search-results URL: https://s.taobao.com/search?q=%E4%B8%89%E6%98%9F&s=88
def parse(self, response):
    """Schedule the search-result pages that this spider scrapes.

    The ``response`` for the start URL is not inspected; receiving it only
    triggers the fan-out. Nine requests are yielded, with the ``s`` query
    parameter set to 88, 132, ..., 440, and each one is handed to
    ``parse_xq``.
    """
    # The value after "q=" is the URL-encoded product name to search for;
    # this spider uses Samsung ("三星") as its example keyword.
    url_template = 'https://s.taobao.com/search?q=%E4%B8%89%E6%98%9F&s={}'
    # NOTE(review): the step of 44 is presumably the number of results per
    # page, which would make 88 the third page -- confirm.
    for offset in range(88, 441, 44):
        yield scrapy.Request(url_template.format(offset), callback=self.parse_xq)
def parse_xq(self, response):
item = TaobaoItem()
# with open('taobao.html','wb') as f:
# f.write(response.body)
html_str = response.text
# print(html_str)
titles = re.findall(r'"raw_ti