1. Create the project
scrapy startproject kuaidaili
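After this command, Scrapy generates a project skeleton roughly like the following (file names may vary slightly across Scrapy versions):

kuaidaili/
    scrapy.cfg
    kuaidaili/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py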
2. Create the spider file
Change into the project directory:
cd ./kuaidaili/kuaidaili
Then generate the spider:
scrapy genspider kuaidailispider "www.kuaidaili.com"
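genspider creates kuaidailispider.py under the spiders/ directory with a skeleton similar to this (the exact template depends on your Scrapy version); steps 3 through 6 below fill in the project files:

# -*- coding: utf-8 -*-
import scrapy

class KuaidailispiderSpider(scrapy.Spider):
    name = 'kuaidailispider'
    allowed_domains = ['www.kuaidaili.com']
    start_urls = ['http://www.kuaidaili.com/']

    def parse(self, response):
        pass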
3. Configure settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for kuaidaili project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'kuaidaili'
SPIDER_MODULES = ['kuaidaili.spiders']
NEWSPIDER_MODULE = 'kuaidaili.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # default pipeline; configured here to write items to a file
    'kuaidaili.pipelines.KuaidailiPipeline': 300,
    # custom pipeline that stores items in a MySQL database
    'kuaidaili.pipelines.MysqlPipeline': 299,
    # custom pipeline that stores items in MongoDB; enable it when needed
    # 'kuaidaili.pipelines.MongoDBPipeline': 298,
}
# MySQL storage settings
DB_HOST = '127.0.0.1'
DB_PORT = 3306
DB_USER = 'root'
DB_PWD = 'root'
DB_NAME = 'kuaidaili_test'
DB_CHARSET = 'utf8'
# MongoDB storage settings
M_DB_HOST = '127.0.0.1'
M_DB_PORT = 27017
M_DB_NAME = 'kuaidaili_test'
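The MysqlPipeline in step 5 inserts into an ip_list table, which this walkthrough assumes already exists in kuaidaili_test. A minimal one-off setup sketch with pymysql (the column types here are my assumption, not part of the original project; adjust as needed):

# create_table.py - one-off setup script (assumed schema)
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='root', db='kuaidaili_test', charset='utf8')
try:
    with conn.cursor() as cursor:
        # ip_list matches the two columns MysqlPipeline inserts: ip and port
        cursor.execute(
            'CREATE TABLE IF NOT EXISTS ip_list ('
            '  id INT AUTO_INCREMENT PRIMARY KEY,'
            '  ip VARCHAR(64),'
            '  port VARCHAR(16)'
            ')')
    conn.commit()
finally:
    conn.close()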
4. Configure items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class KuaidailiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Declare the fields you need here; since the pipelines are already
    # enabled in settings.py, items carrying these fields are handed to
    # the pipelines for processing.
    # IP address
    ip = scrapy.Field()
    # port
    port = scrapy.Field()
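A scrapy.Item behaves like a dict, which is exactly what the pipelines below rely on when they call dict(item). A quick illustration (sample values, not real scraped data):

from kuaidaili.items import KuaidailiItem

item = KuaidailiItem()
item['ip'] = '1.2.3.4'
item['port'] = '8080'
print(dict(item))  # {'ip': '1.2.3.4', 'port': '8080'}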
5. Configure pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import pymysql
import pymongo
from scrapy.utils.project import get_project_settings
class KuaidailiPipeline(object):
    # override the constructor to open the output file here
    def __init__(self):
        # opening the file here means it happens only once
        self.fp = open('kuaidaili.txt', 'w', encoding='utf-8')
    def open_spider(self, spider):
        pass
    # every item is processed here
    def process_item(self, item, spider):
        # convert the item object to a dict
        obj = dict(item)
        # print(obj)
        # serialize obj to a string, then write it to the file
        string = json.dumps(obj, ensure_ascii=False)
        self.fp.write(string + '\n')
        return item
    # override this method to release the file handle when the spider closes
    def close_spider(self, spider):
        self.fp.close()
# store items in MySQL
class MysqlPipeline(object):
    """Pipeline that writes items to a MySQL table."""
    def __init__(self):
        settings = get_project_settings()
        self.host = settings['DB_HOST']
        self.port = settings['DB_PORT']
        self.user = settings['DB_USER']
        self.pwd = settings['DB_PWD']
        self.name = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']
        # connect to the database
        self.connect()
    def connect(self):
        self.conn = pymysql.connect(host=self.host,
                                    port=self.port,
                                    user=self.user,
                                    password=self.pwd,
                                    db=self.name,
                                    charset=self.charset)
        self.cursor = self.conn.cursor()
    def close_spider(self, spider):
        # close the cursor before the connection
        self.cursor.close()
        self.conn.close()
    def process_item(self, item, spider):
        # use a parameterized query so values are escaped safely
        sql = 'insert into ip_list(ip, port) values(%s, %s)'
        # execute the SQL statement
        self.cursor.execute(sql, (item['ip'], item['port']))
        # the insert takes effect only after commit
        self.conn.commit()
        return item
class MongoDBPipeline(object):
    """Pipeline that writes items to a MongoDB collection."""
    def __init__(self):
        settings = get_project_settings()
        self.host = settings['M_DB_HOST']
        self.port = settings['M_DB_PORT']
        self.name = settings['M_DB_NAME']
        # connect to the database
        self.connect()
    def connect(self):
        self.mongocli = pymongo.MongoClient(host=self.host, port=self.port)
        self.dbname = self.mongocli[self.name]
        self.sheetname = self.dbname['ip_list']
    def close_spider(self, spider):
        self.mongocli.close()
    def process_item(self, item, spider):
        # convert the item to a dict before inserting
        data = dict(item)
        # insert the data into the ip_list collection
        self.sheetname.insert_one(data)
        return item
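After a crawl you can spot-check what the pipelines wrote. A minimal verification sketch, assuming the MySQL settings above and the ip_list table from step 3:

import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='root', db='kuaidaili_test', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('SELECT COUNT(*) FROM ip_list')
    print('rows stored:', cursor.fetchone()[0])
conn.close()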
6. Configure kuaidailispider.py
# -*- coding: utf-8 -*-
import scrapy
from kuaidaili.items import KuaidailiItem
import time
class KuaidailispiderSpider(scrapy.Spider):
    name = 'kuaidailispider'
    allowed_domains = ['www.kuaidaili.com']
    # base URL
    url = 'https://www.kuaidaili.com/free/intr/'
    # starting page number
    page = 1
    # start URL
    start_urls = ['https://www.kuaidaili.com/free/intr/1']
    def parse(self, response):
        print('Starting to crawl...')
        ip_list = response.xpath('//tbody/tr/td[1]/text()').extract()
        port_list = response.xpath('//tbody/tr/td[2]/text()').extract()
        for i in range(len(ip_list)):
            item = KuaidailiItem()
            item['ip'] = ip_list[i]
            item['port'] = port_list[i]
            yield item
        # note: time.sleep() blocks Scrapy's event loop; setting
        # DOWNLOAD_DELAY in settings.py is the non-blocking way to throttle
        time.sleep(1)
        self.page += 1
        if self.page <= 10:
            url = self.url + str(self.page)
            print(url)
            yield scrapy.Request(url=url, callback=self.parse)
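Run the spider from the project root:

scrapy crawl kuaidailispider

Scraped items will land in kuaidaili.txt via the default pipeline and, with MysqlPipeline enabled, in the ip_list table.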
Full project source: git@gitee.com:aeasringnar/kuaidaili-scrapy.git