In a cmd window, change into the directory where the project should live:

scrapy startproject anjuke                  # create the project
cd anjuke                                   # switch into the project directory
scrapy genspider anjuke_spider anjuke.com   # create a spider named anjuke_spider, limited to the domain anjuke.com
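These commands generate Scrapy's standard project layout (the exact file list varies slightly by Scrapy version); the files edited below — items.py, pipelines.py, settings.py, and the spider itself — all live inside it:

anjuke/
    scrapy.cfg
    anjuke/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            anjuke_spider.py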
The spider script (this is my anjuke_spider.py):
1. When you don't need to carry data over from the list page, i.e. all you need from the list page is the URL of each entry:
(a screenshot of the list page appeared here in the original post)
# -*- coding: utf-8 -*-
import scrapy
from anjuke.items import AnjukeItem


class AnjukeSpiderSpider(scrapy.Spider):
    print('spider started 1111111....................')  # debug print; runs once when the class is loaded
    name = 'anjuke_spider'
    allowed_domains = ['anjuke.com']
    start_urls = ['https://chengdu.anjuke.com/tycoon/wenjiang/']

    def parse(self, response):
        print('parsing list page 11111....................')
        # Extract the URL of each entry on the list page
        for link in response.xpath("//div[contains(@class, 'jjr-itemmod')]/a/@href").extract():
            # For each URL the loop yields, request the detail page and
            # hand the response to parse_detail
            yield scrapy.Request(link, callback=self.parse_detail)
        # Find the "next page" link, i.e. pagination
        next_url = response.xpath('/html/body/div[6]/div[2]/div[3]/div/a[7]/@href').extract_first()
        if next_url:
            # If there is a next page, call back into parse to keep
            # collecting list URLs from the following pages
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_detail(self, response):
        # The item class defined in items.py
        item = AnjukeItem()
        item['typee'] = response.xpath('/html/body/div[4]/div[2]/div[2]/div[1]/p[1]/text()').extract_first()
        item['feee'] = response.xpath('//*[@id="shop-content"]/div[2]/div[2]/div/div[1]/dl[1]/dd/p[1]/a/text()').extract_first()
        item['namee'] = response.xpath('/html/body/div[2]/div[2]/div/div[1]/div/a/text()').extract_first()
        item['pricee'] = response.xpath('//*[@id="shop-content"]/div[2]/div[2]/div/div[1]/dl[1]/dd/p[2]/span/text()').extract_first()
        yield item
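A side note: the absolute next-page XPath above (/html/body/div[6]/...) breaks as soon as the page layout shifts. A more resilient sketch, assuming the pager's next link is an <a> whose text is 下一页 (that selector is my assumption about Anjuke's markup, not taken from the original post):

# Sketch: select the next-page link by its visible text instead of position.
next_url = response.xpath('//a[contains(text(), "下一页")]/@href').extract_first()
if next_url:
    # response.follow resolves relative URLs against the current page (Scrapy >= 1.4)
    yield response.follow(next_url, callback=self.parse)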
2. When you need to carry data from the list page over to the detail page:
# -*- coding: utf-8 -*-
import scrapy
from anjuke.items import AnjukeItem


class AnjukeSpiderSpider(scrapy.Spider):
    print('spider started 1111111....................')  # debug print; runs once when the class is loaded
    name = 'anjuke_spider'
    allowed_domains = ['anjuke.com']
    start_urls = ['https://chengdu.anjuke.com/community/?from=navigation']

    def parse(self, response):
        print('parsing list page 11111....................')
        # Locate the block for each entry on the list page; this yields a
        # list of the <div> elements that hold the data we want
        masage_div = response.xpath('//*[@id="list-content"]/div')
        # Loop over the blocks and pull the name, price, etc. out of each one
        for x in masage_div:
            item = AnjukeItem()
            item['namee'] = x.xpath('./div[1]/h3/a/text()').extract_first()
            item['pricee'] = x.xpath('./div[2]/p[1]/strong/text()').extract_first()
            # URL of this entry's detail page
            detail_url = x.xpath('./div[1]/h3/a/@href').extract_first()
            # With the name, price, and detail URL in hand, request the detail
            # page, carrying the item along in meta so parse_detail can
            # finish filling it in
            yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
        # Find the "next page" link, i.e. pagination
        next_url = response.xpath('/html/body/div[5]/div[3]/div[3]/div/a[7]/@href').extract_first()
        if next_url:
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_detail(self, response):
        # response.meta gives access to the dict passed in via meta
        item = response.meta['item']
        item['typee'] = response.xpath('//*[@id="basic-infos-box"]/dl/dd[1]/text()').extract_first()
        item['feee'] = response.xpath('//*[@id="basic-infos-box"]/dl/dd[2]/text()').extract_first()
        yield item
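Passing the item through meta works, but on Scrapy 1.7+ the cleaner route is cb_kwargs, which delivers the item as a named argument to the callback. A minimal sketch of the two spots that change:

# Sketch: replace meta with cb_kwargs (Scrapy >= 1.7); the kwargs arrive
# as named parameters on the callback.
yield scrapy.Request(detail_url, callback=self.parse_detail,
                     cb_kwargs={'item': item})

def parse_detail(self, response, item):
    item['typee'] = response.xpath('//*[@id="basic-infos-box"]/dl/dd[1]/text()').extract_first()
    item['feee'] = response.xpath('//*[@id="basic-infos-box"]/dl/dd[2]/text()').extract_first()
    yield item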
The items script (items.py):
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class AnjukeItem(scrapy.Item):
    # define the fields for your item here like:
    namee = scrapy.Field()
    pricee = scrapy.Field()
    typee = scrapy.Field()
    feee = scrapy.Field()
The pipelines script (pipelines.py):
Create the database and the table ahead of time.
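A minimal sketch of creating that table with pymysql; the column types are my assumption (plain VARCHARs), since the original post doesn't show its schema:

import pymysql

# Assumed schema: four text columns matching the item fields, plus an id.
con = pymysql.connect(host='127.0.0.1', user='username', passwd='password',
                      db='dbname', charset='utf8', port=3306)
with con.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS anjuke_test (
            id INT AUTO_INCREMENT PRIMARY KEY,
            namee VARCHAR(255),
            typee VARCHAR(255),
            pricee VARCHAR(64),
            feee VARCHAR(64)
        ) DEFAULT CHARSET=utf8
    """)
con.commit()
con.close()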
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class AnjukePipeline(object):
    def process_item(self, item, spider):
        con = pymysql.connect(
            host='127.0.0.1',
            user='username',    # your MySQL user
            passwd='password',  # your MySQL password
            db='dbname',        # your database name
            charset='utf8',
            port=3306)
        cue = con.cursor()
        print("mysql connect success")
        try:
            cue.execute(
                "INSERT INTO anjuke_test(namee,typee,pricee,feee) VALUES (%s,%s,%s,%s)",
                (item['namee'],
                 item['typee'],
                 item['pricee'],
                 item['feee']))
            print("Insert success")
        except Exception as e:
            print("Insert error:", e)
            con.rollback()
        else:
            con.commit()
        con.close()
        return item
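Opening a fresh connection for every item is wasteful; Scrapy pipelines provide open_spider/close_spider hooks, so one connection can serve the whole crawl. A sketch of the same pipeline restructured that way (credentials are placeholders):

import pymysql


class AnjukePipeline(object):
    def open_spider(self, spider):
        # One connection for the entire crawl
        self.con = pymysql.connect(host='127.0.0.1', user='username',
                                   passwd='password', db='dbname',
                                   charset='utf8', port=3306)
        self.cur = self.con.cursor()

    def close_spider(self, spider):
        self.cur.close()
        self.con.close()

    def process_item(self, item, spider):
        try:
            self.cur.execute(
                "INSERT INTO anjuke_test(namee,typee,pricee,feee) VALUES (%s,%s,%s,%s)",
                (item['namee'], item['typee'], item['pricee'], item['feee']))
            self.con.commit()
        except Exception as e:
            print("Insert error:", e)
            self.con.rollback()
        return item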
In settings.py, enable the pipeline by uncommenting:
ITEM_PIPELINES = {
    'anjuke.pipelines.AnjukePipeline': 300,
}
Finally, open cmd in the project's root directory and run scrapy crawl <spider name> — here, scrapy crawl anjuke_spider — and you're done.
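If you want to sanity-check the scraped items without touching MySQL, Scrapy's built-in feed export can write them straight to a file instead:

scrapy crawl anjuke_spider -o items.json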