spider
import scrapy
from dingdian.items import DingdianItem
class A123wxCcSpider(scrapy.Spider):
    """Spider that scrapes book names and authors from www.123wx.cc category pages."""

    name = '123wx_cc'
    start_urls = ['https://www.123wx.cc/']

    def parse(self, response):
        """Collect the site's category links and follow each one to show_info."""
        # The slice [1:8] keeps seven nav entries, skipping the first link --
        # presumably the "home" entry; confirm against the live page layout.
        category_paths = response.xpath('//ul/li/a/@href').extract()[1:8]
        for path in category_paths:
            # Hrefs are site-relative, so prepend the host.
            yield scrapy.Request('https://www.123wx.cc' + path, self.show_info)

    def show_info(self, response):
        """Yield one DingdianItem per book row on a category listing page."""
        for row in response.xpath('//div[@class="l bd"]/ul/li'):
            # BUG FIX: the original created a single DingdianItem before the
            # loop and mutated it on every iteration; because Scrapy pipelines
            # may process items asynchronously, every yielded reference could
            # end up holding the last row's data. Create a fresh item per row.
            item = DingdianItem()
            item['book_name'] = row.xpath('span[2]/a/text()').extract_first()
            # '暂无' ("none yet") is the fallback when no author is listed.
            item['book_author'] = row.xpath('span[@class="s4"]/text()').extract_first('暂无')
            yield item
setting
# --- settings.py (excerpt) ---
# Route every scraped item through DingdianPipeline; 300 is the pipeline's
# priority (lower numbers run earlier when several pipelines are enabled).
ITEM_PIPELINES = {
    'dingdian.pipelines.DingdianPipeline': 300,
}
# MySQL credentials, read back by DingdianPipeline.from_crawler.
# NOTE(review): plaintext credentials in settings -- fine for a tutorial,
# move to environment variables for anything real.
MYSQL_USERNAME = 'root'
MYSQL_PWD = '123456'
MYSQL_DB = 'xiaoshuo'
items
import scrapy
class DingdianItem(scrapy.Item):
    """Container for one scraped book record from 123wx.cc."""

    # Book title, taken from the listing row's second <span>.
    book_name = scrapy.Field()
    # Author name; the spider substitutes '暂无' when the page omits it.
    book_author = scrapy.Field()
pipelines
from itemadapter import ItemAdapter
import pymysql
class DingdianPipeline:
    """Persist each DingdianItem into the MySQL table `tests`."""

    def __init__(self, username, password, db):
        # Only store the connection parameters here; the actual connection is
        # opened lazily in open_spider so construction stays side-effect free.
        self.username = username
        self.password = password
        self.db = db

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor: pull the MySQL credentials from Scrapy settings."""
        return cls(
            username=crawler.settings.get('MYSQL_USERNAME'),
            password=crawler.settings.get('MYSQL_PWD'),
            db=crawler.settings.get('MYSQL_DB'),
        )

    def open_spider(self, spider):
        # One connection/cursor pair per crawl; both are released in close_spider.
        self.client = pymysql.connect(user=self.username, password=self.password, db=self.db)
        self.cursor = self.client.cursor()

    def process_item(self, item, spider):
        """Insert one item and return it unchanged for any later pipelines."""
        # BUG FIX: the original interpolated the values with str.format, which
        # breaks on any title containing a double quote and is an SQL-injection
        # vector. Use pymysql's parameter binding instead.
        sql = 'insert into tests(book_name,book_author) values (%s,%s)'
        self.cursor.execute(sql, (item['book_name'], item['book_author']))
        self.client.commit()
        return item

    def close_spider(self, spider):
        # Closing the connection also invalidates the cursor.
        self.client.close()
京东 (JD.com)
spider
import scrapy
from lxml import etree
class JdComSpider(scrapy.Spider):
    """Spider for JD.com search results (keyword 大地瓜), pages 1-6."""

    name = 'jd_com'

    # AJAX search endpoint; only `page` and the result offset `s` vary.
    SEARCH_URL = ('https://search.jd.com/s_new.php?keyword=%E5%A4%A7%E5%9C%B0%E7%93%9C&qrst=1&'
                  'wq=%E5%A4%A7%E5%9C%B0%E7%93%9C&stock=1&pvid=3b8d62d7e2294365911b56aeb0653d3d&'
                  'page={}&s={}&click=0')

    def start_requests(self):
        # Same URLs as the original's two duplicated branches plus a running
        # counter: page 1 uses s=1, page n (n>=2) uses s = 26*(n-1).
        for page in range(1, 7):
            s = 1 if page == 1 else 26 * (page - 1)
            yield scrapy.Request(self.SEARCH_URL.format(page, s), self.goods)

    def goods(self, response):
        """Print (price, description, store name) for every product on the page."""
        # BUG FIX: the original called etree.HTML(response.text) three times,
        # re-parsing the same document for each xpath. Parse once and reuse.
        tree = etree.HTML(response.text)
        prices = tree.xpath('//div[@class="p-price"]/strong/i/text()')
        # A product description is split across <em> fragments; join them.
        intros = [''.join(a.xpath('em/text()')).strip()
                  for a in tree.xpath('//div[@class="p-name p-name-type-2"]/a')]
        store_names = tree.xpath('//div[@class="p-shop"]/span/a/text()')
        # BUG FIX: indexing all three lists by len(prices) raised IndexError
        # whenever a product had no store link; zip stops at the shortest list.
        for price, intro, store in zip(prices, intros, store_names):
            print(price, intro, store)
setting
# Default request headers for the JD search spider.
# BUG FIX: the original fused 'x-requested-with: XMLHttpRequest' onto the end
# of the user-agent value (a missing quote/comma). It belongs in its own
# header -- presumably the s_new.php AJAX endpoint checks for it; verify
# against a captured browser request.
DEFAULT_REQUEST_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
    'referer': 'https://search.jd.com/Search?keyword=%E5%A4%A7%E5%9C%B0%E7%93%9C&qrst=1&wq=%E5%A4%A7%E5%9C%B0%E7%93%9C&stock=1&pvid=1bf8e67af6b74013a464ee84079faf98&page=1&s=1&click=0'
}