1、创建Scrapy项目
scrapy startproject PosClient
2、进入项目目录,使用genspider命令创建Spider
scrapy genspider posclient XXXX.com
3、定义要抓取的数据(处理items.py文件)
# -*- coding: utf-8 -*-
import scrapy
class PosclientItem(scrapy.Item):
    """One POS client record scraped from the client listing page.

    All fields are plain ``scrapy.Field()`` containers; values are filled
    in by the spider as stripped strings.
    """
    # Row number in the listing table
    number_list = scrapy.Field()
    # Customer phone number
    client_phone = scrapy.Field()
    # Customer name
    client_name = scrapy.Field()
    # Customer address
    client_add = scrapy.Field()
    # Registration date
    client_date = scrapy.Field()
    # Purchase amount
    client_sale = scrapy.Field()
    # Number of purchases
    client_sale_num = scrapy.Field()
    # Planting area
    client_area = scrapy.Field()
4、编写提取item数据的Spider(在spiders文件夹下:posclient.py)
# -*- coding: utf-8 -*-
import scrapy
from PosClient.items import PosclientItem
class PosclientSpider(scrapy.Spider):
    """Log into the POS management system, then scrape the paginated
    client listing into :class:`PosclientItem` records."""
    name = 'posclient'
    allowed_domains = ['XXXX.com']
    # Login page URL
    login_page = 'https://pos.XXXX.com/login.html'
    # Starting page number of the paginated client listing
    offset = 1
    # First half of the listing URL; the page number is appended
    url = 'https://pos.XXXX.com/client/p='
    # Concatenated URL of the first listing page
    start_urls = [url + str(offset)]
    # NOTE(review): prompting at class-definition (import) time is unusual
    # for a Scrapy spider; preserved as-is because callers rely on it.
    username = input("请输入账号:")
    password = input("请输入密码:")

    def start_requests(self):
        """Scrapy entry point (name must not change): fetch the login page first."""
        yield scrapy.Request(url=self.login_page, callback=self.login)

    def login(self, response):
        """Submit the login form with the credentials the user typed."""
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"j_username": self.username, "j_password": self.password},
            callback=self.parse_page
        )

    def parse_page(self, response):
        """Check the login result; on success request the first listing page."""
        # Decode once instead of once per check.
        body = response.body.decode('utf-8')
        if "loginerror" in body:
            print("登录失败,错误的手机号码或密码!")
        if "</span>首页" in body:
            print("欢迎您'%s',成功登陆POS管理系统!" % self.username)
            # Logged in: fetch the listing page and hand rows to parse().
            full_url = self.url + str(self.offset)
            yield scrapy.Request(full_url, callback=self.parse)

    def parse(self, response):
        """Yield one item per table row, then follow every pagination link."""
        # Links to the next pages of the listing.
        next_url_list = response.xpath(
            '//div[@class="dataTables_paginate paging_full_numbers"]/span/span/a/@href'
        ).extract()
        for each in response.xpath('//div[@class="dataTables_wrapper"]'):
            # Column extractions; several columns carry surrounding whitespace.
            number_list = each.xpath('.//td[1]/text()').extract()
            client_phone = each.xpath('.//td[2]/a[1]/text()').extract()
            client_name = each.xpath('.//td[2]/a[2]/text()').extract()
            client_add = each.xpath('.//td[3]/a/text()').extract()
            client_date = each.xpath('.//tbody//td[4]/a/text()').extract()
            client_sale = each.xpath('.//tbody//td[5]/a/text()').extract()
            client_sale_num = each.xpath('.//tbody//td[6]/a/text()').extract()
            client_area = each.xpath('.//tbody//td[7]/text()').extract()
            for i in range(len(client_phone)):
                # BUG FIX: create a fresh item per row. The original built a
                # single item outside this loop and mutated/re-yielded it, so
                # any pipeline holding references would see every item
                # overwritten with the last row's values.
                item = PosclientItem()
                item['number_list'] = number_list[i].strip()
                item['client_phone'] = client_phone[i].strip()
                item['client_name'] = client_name[i].strip()
                item['client_add'] = client_add[i].strip()
                # Timestamps end with a stray ".0" (e.g. "2017-11-10 11:04:40.0")
                item['client_date'] = client_date[i].strip()[:-2]
                item['client_sale'] = client_sale[i].strip()
                item['client_sale_num'] = client_sale_num[i].strip()
                item['client_area'] = client_area[i].strip()
                yield item
        # Follow the pagination links with the same callback.
        for url in next_url_list:
            full_url = 'https://pos.XXXX.com/client.html' + str(url)
            yield scrapy.Request(url=full_url, callback=self.parse)
5、处理pipelines管道文件保存数据,可将结果保存到文件中(pipelines.py)
# -*- coding: utf-8 -*-
import json
from openpyxl import Workbook
# 转码操作,继承json.JSONEncoder的子类
class MyEncoder(json.JSONEncoder):
    """JSON encoder that additionally accepts ``bytes`` values,
    decoding them as UTF-8 before serialisation."""

    def default(self, o):
        # Anything other than bytes falls back to the base class,
        # which raises TypeError for unserialisable objects.
        if not isinstance(o, bytes):
            return json.JSONEncoder.default(self, o)
        return o.decode('utf-8')
class PosclientPipeline(object):
    """Collect every scraped client row into an Excel workbook and
    save it to disk when the spider closes."""

    def __init__(self):
        workbook = Workbook()
        self.wb = workbook
        self.ws = workbook.active
        # Header row, one cell per item field.
        self.ws.append(['序号','客户手机号','客户姓名','客户地址','注册时间','采购金额','采购次数','种植面积'])

    def process_item(self, item, spider):
        # Append one worksheet row per item, columns in header order.
        columns = ('number_list', 'client_phone', 'client_name', 'client_add',
                   'client_date', 'client_sale', 'client_sale_num', 'client_area')
        self.ws.append([item[key] for key in columns])
        return item

    def close_spider(self, spider):
        # Persist the workbook once the crawl is finished.
        self.wb.save('pos_client.xlsx')
        print("数据处理完毕,谢谢使用!")
6、配置settings文件(settings.py)
# Obey robots.txt rules — see https://blog.csdn.net/z564359805/article/details/80691677
ROBOTSTXT_OBEY = False
# Delay (seconds) between consecutive downloads
DOWNLOAD_DELAY = 2
# Override the default request headers: add a User-Agent
DEFAULT_REQUEST_HEADERS = {
'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
}
# Configure item pipelines: enable PosclientPipeline (lower number = earlier)
ITEM_PIPELINES = {
'PosClient.pipelines.PosclientPipeline': 300,
}
# Optionally write the log to a local file
LOG_FILE = "stats.log"
LOG_LEVEL = "DEBUG"
# Redirect print() output into the log as well
LOG_STDOUT = True
7、以上设置完毕,进行爬取:执行项目命令crawl,启动Spider:
scrapy crawl posclient