A detailed walk-through of a simple spider built with the Python scraping library Scrapy

A recent project required a crawler to scrape some question banks. Until now I had always written crawlers in Node or PHP, but Python has a strong reputation for scraping, so I picked up its crawling framework, Scrapy.

Below is a brief introduction to Scrapy's project layout and usage.

First, install the Scrapy framework:

pip install scrapy
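A quick sanity check that the install worked is to ask Scrapy for its version:

scrapy version

If this prints a version string, the framework is on your PATH and ready to use.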

Next, create a crawler project with the scrapy command:

scrapy startproject questions

A quick overview of the generated files:

scrapy.cfg: the project's configuration file
questions/: the project's Python module; this is where your code goes
questions/items.py: the project's item definitions
questions/pipelines.py: the project's item pipelines
questions/settings.py: the project's settings
questions/spiders/: the directory that holds the spider code
questions/spiders/xueersi.py: the main spider implementation

xueersi.py – the spider itself:

# -*- coding: utf-8 -*-
import re

import scrapy

from questions.items import QuestionsItem


class xueersiSpider(scrapy.Spider):
    name = "xueersi"  # spider name
    allowed_domains = ["tiku.xueersi.com"]  # domain the spider is restricted to
    # URLs the crawl starts from
    start_urls = [
        "http://tiku.xueersi.com/shiti/list_1_1_0_0_4_0_1",
        "http://tiku.xueersi.com/shiti/list_1_2_0_0_4_0_1",
        "http://tiku.xueersi.com/shiti/list_1_3_0_0_4_0_1",
    ]
    levels = ['偏易', '中档', '偏难']
    subjects = ['英语', '语文', '数学']

    # Scrapy calls start_requests() when the spider starts; if it is not defined,
    # the default implementation requests start_urls and passes each response to parse().
    # def start_requests(self):
    #     yield scrapy.Request('http://tiku.xueersi.com/shiti/list_1_2_0_0_4_0_39', callback=self.getquestion)

    # Called with each start URL's response when start_requests() is not overridden.
    def parse(self, response):
        # XPath selector syntax is not covered here; see the official documentation.
        arr = response.xpath("//ul[@class='pagination']/li/a/text()").extract()
        total_page = arr[3]
        # Walk the listing pages.
        for index in range(int(total_page)):
            # Issue a new request per page and collect all of its questions.
            yield scrapy.Request(response.url.replace('_0_0_4_0_1', "_0_0_4_0_" + str(index)),
                                 callback=self.getquestion)

    # Extract the questions on one listing page.
    def getquestion(self, response):
        for res in response.xpath('//div[@class="main-wrap"]/ul[@class="items"]/li'):
            item = QuestionsItem()  # instantiate the Item
            # Extract the question markup. NOTE: this regex (and the two below) lost
            # their HTML tags when the article was published; the patterns here are
            # approximate reconstructions.
            questions = res.xpath('./div[@class="content-area"]').re(r'<div class="content-area"[^>]*>([\s\S]+?)</div>')
            if len(questions):
                # The raw question markup.
                question = questions[0].strip()
                item['source'] = question
                # Strip HTML tags to get plain text.
                dr = re.compile(r'<[^>]+>', re.S)
                question = dr.sub('', question)
                content = res.extract()
                item['content'] = question
                # Derive the subject from the listing URL.
                subject = re.findall(r'http:\/\/tiku\.xueersi\.com\/shiti\/list_1_(\d+)', response.url)
                item['subject'] = self.subjects[int(subject[0]) - 1]
                # Derive the difficulty level (reconstructed pattern, see note above).
                levels = res.xpath('//div[@class="info"]').re(r'难度:([\s\S]+?)<')
                item['level'] = self.levels.index(levels[0]) + 1
                # Extract the options A-D (reconstructed pattern, see note above).
                options = re.findall(r'[A-D][.．]([\s\S]+?)<', content)
                item['options'] = options
                if len(options):
                    url = res.xpath('./div[@class="info"]/a/@href').extract()[0]
                    request = scrapy.Request(url, callback=self.getanswer)
                    request.meta['item'] = item  # stash the item so the answer request can complete it
                    yield request

    # Extract the answer (and the analysis, when present) from the detail page.
    def getanswer(self, response):
        # NOTE: both regexes below lost their HTML tags when the article was published;
        # the patterns here are approximate reconstructions.
        res = response.xpath('//div[@class="part"]').re(r'<p>([\s\S]+?)</p>')
        con = re.findall(r'([\s\S]+?)<[\s\S]+?([A-D])', res[0])  # answers that come with an analysis
        if con:
            answer = con[0][1]
            analysis = con[0][0]  # the analysis text
        else:
            answer = res[0]
            analysis = ''
        if answer:
            item = response.meta['item']  # retrieve the stashed item
            item['answer'] = answer.strip()
            item['analysis'] = analysis.strip()
            item['answer_url'] = response.url
            yield item  # hand the finished item to the pipeline (pipelines.py)
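When working out selectors like the ones above, it helps to experiment interactively before committing them to the spider. Scrapy ships with an interactive shell for exactly this; the session below is only a sketch, reusing one of the listing URLs and the same expressions the spider relies on:

scrapy shell "http://tiku.xueersi.com/shiti/list_1_1_0_0_4_0_1"

# inside the shell, `response` is the downloaded page:
>>> response.xpath("//ul[@class='pagination']/li/a/text()").extract()
>>> response.xpath('//div[@class="main-wrap"]/ul[@class="items"]/li')

Once an expression returns what you expect, it can be pasted into parse() or getquestion() unchanged.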

items.py – the data model definition:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class QuestionsItem(scrapy.Item):
    content = scrapy.Field()
    subject = scrapy.Field()
    level = scrapy.Field()
    answer = scrapy.Field()
    options = scrapy.Field()
    analysis = scrapy.Field()
    source = scrapy.Field()
    answer_url = scrapy.Field()
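A scrapy.Item behaves like a dict that only accepts the declared fields, which is why the spider can fill it key by key and why a misspelled field name fails loudly instead of silently creating a new key. A tiny illustration (the misspelled name is deliberate):

item = QuestionsItem()
item['subject'] = '数学'   # fine: 'subject' is a declared Field
item['subjcet'] = '数学'   # raises KeyError: the field is not declared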

pipelines.py – the item pipeline (in this example the scraped data is written to a local MySQL database):

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import hashlib

import pymysql


class QuestionsPipeline(object):
    def __init__(self):
        # Open the database connection.
        self.connect = pymysql.connect('localhost', 'root', '', 'question', use_unicode=True, charset='utf8')
        # Grab a cursor.
        self.cursor = self.connect.cursor()
        print("connecting mysql success!")
        self.answer = ['A', 'B', 'C', 'D']

    def process_item(self, item, spider):
        content = pymysql.escape_string(item['content'])
        # Hash the question text; the hash column is used to filter out duplicates.
        hash = hashlib.md5(content.encode('utf-8')).hexdigest()
        selectstr = "select id from question where hash='%s'" % (hash)
        self.cursor.execute(selectstr)
        res = self.cursor.fetchone()
        # Skip questions that are already stored.
        if not res:
            # Insert the question.
            sqlstr = "insert into question(content,source,subject,level,answer,analysis,hash,answer_url) VALUES('%s','%s','%s','%s','%s','%s','%s','%s')" % (
                content, pymysql.escape_string(item['source']), item['subject'], item['level'],
                item['answer'], pymysql.escape_string(item['analysis']), hash, item['answer_url'])
            self.cursor.execute(sqlstr)
            qid = self.cursor.lastrowid
            # Insert the options.
            for index in range(len(item['options'])):
                option = item['options'][index]
                answer = self.answer.index(item['answer'])
                # '2' marks the correct option, '1' an incorrect one.
                if answer == index:
                    ans = '2'
                else:
                    ans = '1'
                sqlstr = "insert into options(content,qid,answer) VALUES('%s','%s','%s')" % (
                    pymysql.escape_string(option), qid, ans)
                self.cursor.execute(sqlstr)
            self.connect.commit()
            # self.connect.close()
        return item
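As the boilerplate comment at the top of pipelines.py notes, the pipeline only receives items after it has been registered in the ITEM_PIPELINES setting. With the project created above, the entry in questions/settings.py would look like this (the priority 300 is an arbitrary but conventional choice; lower numbers run first):

ITEM_PIPELINES = {
    'questions.pipelines.QuestionsPipeline': 300,
}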

Once the spider is in place, run the following from the project root:

scrapy crawl xueersi # scrapy crawl <spider name>
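If you only want to eyeball the scraped items without setting up MySQL, Scrapy's built-in feed export can write them straight to a file instead:

scrapy crawl xueersi -o questions.json

Note that the database pipeline still runs alongside the export unless it is removed from ITEM_PIPELINES.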

For more on using the Python scraping library Scrapy, see the official documentation and related articles.
