Scraping Ctrip Scenic Spot Data with Python's Scrapy Framework

This article shows how to use Python's Scrapy framework to crawl Ctrip for scenic spots in Henan Province, collecting each spot's name, address, province/city/county, description, and image URL. It first identifies the start page, then creates a Scrapy project, configures settings.py, the downloader middleware, items.py, and the spider scenic.py, and implements a pipeline that saves the data to a MySQL database. The crawl is started by running start.py.

———————————————————————————————

[Copyright notice: this article is the author's original work. Please credit the source when reposting.]

Source: https://blog.csdn.net/sdksdk0/article/details/82381198

Author: 朱培      ID: sdksdk0

———————————————————————————————

This article uses the Scrapy framework with Python 3.6 to crawl Ctrip for scenic spots in Henan Province, collecting each spot's name, address, province/city/county, description, and image URL. Searching Ctrip shows that the listing page for Henan is http://piao.ctrip.com/dest/u-_ba_d3_c4_cf/s-tickets/P1/, so the crawl starts from that page. The scraped data is saved to a MySQL database.

1. Create the Scrapy project:

scrapy startproject ctrip
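
startproject creates the project skeleton; the files edited in the following steps live in the inner ctrip package. For orientation, the generated layout looks roughly like this:

ctrip/
    scrapy.cfg
    ctrip/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py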

2. Create the spider (cd into the ctrip folder first):

scrapy genspider scenic "ctrip.com"
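
genspider writes a skeleton spider to ctrip/spiders/scenic.py, roughly like the following (exact contents depend on the Scrapy version); step 6 replaces its body:

# -*- coding: utf-8 -*-
import scrapy


class ScenicSpider(scrapy.Spider):
    name = 'scenic'
    allowed_domains = ['ctrip.com']
    start_urls = ['http://ctrip.com/']

    def parse(self, response):
        pass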

3. In settings.py:

BOT_NAME = 'ctrip'

SPIDER_MODULES = ['ctrip.spiders']
NEWSPIDER_MODULE = 'ctrip.spiders'

ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

DOWNLOADER_MIDDLEWARES = {
    'ctrip.middlewares.UserAgentDownloadMiddleware': 543,
}

ITEM_PIPELINES = {
    'ctrip.pipelines.DBPipeline': 300,
}
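
These are the settings the article relies on. Beyond them, if Ctrip starts rejecting requests it may help to slow the crawl down with Scrapy's built-in throttling settings; the values below are only illustrative, not part of the original setup:

DOWNLOAD_DELAY = 1                  # pause between requests to the same site
CONCURRENT_REQUESTS_PER_DOMAIN = 4  # limit parallel requests per domain
AUTOTHROTTLE_ENABLED = True         # let Scrapy adapt the delay automatically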

4. In middlewares.py:

import random


class UserAgentDownloadMiddleware(object):
    # A small pool of desktop User-Agent strings to rotate through
    USER_AGENTS = [
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    ]

    def process_request(self, request, spider):
        # Attach a randomly chosen User-Agent to every outgoing request
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
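
The middleware is not verified in the original article, but it can be sanity-checked outside a crawl, assuming the ctrip package is importable (e.g. run from the project root):

from scrapy.http import Request
from ctrip.middlewares import UserAgentDownloadMiddleware

mw = UserAgentDownloadMiddleware()
req = Request("http://piao.ctrip.com/")
mw.process_request(req, spider=None)
print(req.headers['User-Agent'])   # one of the strings above, chosen at random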

5. items.py:

import scrapy


class ScenicItem(scrapy.Item):
    province = scrapy.Field()
    city = scrapy.Field()
    county = scrapy.Field()
    name = scrapy.Field()
    scenic_url = scrapy.Field()
    image_url = scrapy.Field()
    address = scrapy.Field()
    descript = scrapy.Field()
    code = scrapy.Field()
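
A ScenicItem behaves like a dict, which is how the pipeline in step 7 reads the fields back out. A quick illustration with made-up values (not part of the project code):

item = ScenicItem(name="龙门石窟", province="河南省", code="A1")
print(item['name'])   # 龙门石窟
print(dict(item))     # {'name': '龙门石窟', 'province': '河南省', 'code': 'A1'}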

6. scenic.py:

# -*- coding: utf-8 -*-
import scrapy
import re

from ctrip.items import ScenicItem


class ScenicSpider(scrapy.Spider):
    name = 'scenic'
    allowed_domains = ['ctrip.com']
    start_urls = ['http://piao.ctrip.com/dest/u-_ba_d3_c4_cf/s-tickets/P1/']
    count = 0

    def parse(self, response):
        # Each scenic spot on the listing page sits in a searchresult_product04 block
        trs = response.xpath("//div[@id='searchResultContainer']//div[@class='searchresult_product04']")
        for tr in trs:
            # Link to the spot's detail page; the numeric id sits between "t/t" and "."
            ctrip_url = tr.xpath(".//div[1]/a/@href").get()
            c1_url = ctrip_url.split("t/t")
            scemic_num = c1_url[1].split(".")
            scemic_num = scemic_num[0]
            # Keep the detail link; the pipeline de-duplicates on this value
            scenic_url = ctrip_url

            image_url = tr.xpath(".//div[1]/a/img/@src").get()
            address = tr.xpath(".//div[1]/div[@class='adress']//text()").get().strip()
            address = re.sub(r"地址:", "", address)
            descript = tr.xpath(".//div[1]/div[@class='exercise']//text()").get().strip()
            descript = re.sub(r"特色:", "", descript)
            name = tr.xpath(".//div[1]//h2/a/text()").get().strip()

            # Split the address into province / city / county (or district)
            cityinfo = address
            province = "河南省"
            city = ""
            county = ""
            if "省" in cityinfo:
                # Address contains a province: try "省 + 市 + 县/区" first
                matchObj = re.match(r'(.*)[?省](.+?)市(.+?)([县]|[区])', cityinfo, re.M | re.I)
                if matchObj:
                    province = matchObj.group(1) + "省"
                    city = matchObj.group(2) + "市"
                    if "县" in cityinfo:
                        county = matchObj.group(3) + "县"
                    else:
                        county = matchObj.group(3) + "区"
                else:
                    # Fall back to "省 + 市 + 市" (county-level city), then "省 + 市"
                    matchObj2 = re.match(r'(.*)[?省](.+?)市(.+?)市', cityinfo, re.M | re.I)
                    matchObj1 = re.match(r'(.*)[?省](.+?)市', cityinfo, re.M | re.I)
                    if matchObj2:
                        city = matchObj2.group(2) + "市"
                        county = matchObj2.group(3) + "市"
                    elif matchObj1:
                        city = matchObj1.group(2) + "市"
                    else:
                        # "省 + 县/区" with no city in between
                        matchObj1 = re.match(r'(.*)[?省](.+?)([县]|[区])', cityinfo, re.M | re.I)
                        if matchObj1:
                            if "县" in cityinfo:
                                county = matchObj1.group(2) + "县"
                            else:
                                county = matchObj1.group(2) + "区"
            else:
                # No province in the address: try "市 + 县/区", then just "市", then "县"
                matchObj = re.match(r'(.+?)市(.+?)([县]|[区])', cityinfo, re.M | re.I)
                if matchObj:
                    city = matchObj.group(1) + "市"
                    if "县" in cityinfo:
                        county = matchObj.group(2) + "县"
                    else:
                        county = matchObj.group(2) + "区"
                else:
                    matchObj = re.match(r'(.+?)市', cityinfo, re.M | re.I)
                    if matchObj:
                        city = matchObj.group(1) + "市"
                    else:
                        matchObj = re.match(r'(.+?)县', cityinfo, re.M | re.I)
                        if matchObj:
                            county = matchObj.group(1) + "县"

            # Give every record a sequential code A1, A2, ...
            self.count += 1
            code = "A" + str(self.count)
            item = ScenicItem(name=name, province=province, city=city, county=county,
                              address=address, descript=descript,
                              scenic_url=scenic_url, image_url=image_url, code=code)
            yield item

        # Follow the "next page" link, if any
        next_url = response.xpath('//*[@id="searchResultContainer"]/div[11]/a[11]/@href').get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse)
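
The nested regexes above simply peel the province, city, and county (or district) off the front of the address string. A quick standalone check on a made-up address (hypothetical input, not taken from Ctrip) shows the idea:

import re

sample = "河南省郑州市金水区花园路"   # hypothetical address, for illustration only
m = re.match(r'(.*)[?省](.+?)市(.+?)([县]|[区])', sample)
if m:
    print(m.group(1) + "省")           # 河南省
    print(m.group(2) + "市")           # 郑州市
    print(m.group(3) + m.group(4))     # 金水区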

7. pipelines.py, which saves the data to the MySQL database:

import pymysql


# Stores scraped items in MySQL
class DBPipeline(object):

    def __init__(self):
        # Connect to the database
        self.connect = pymysql.connect(
            host='localhost',
            port=3306,
            db='edu_demo',
            user='root',
            passwd='123456',
            charset='utf8',
            use_unicode=True)
        # Cursor used for all queries and inserts
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        try:
            # De-duplicate: skip if a row with the same ctrip_url already exists
            self.cursor.execute(
                """select * from a_scenic where ctrip_url = %s""",
                (item['scenic_url'],))
            repetition = self.cursor.fetchone()
            if repetition:
                pass
            else:
                # Insert the new record
                self.cursor.execute(
                    """insert into a_scenic(code, province, city, county, name, description,
                                            ctrip_url, image_url, address, type)
                       values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                    (item['code'],
                     item['province'],
                     item['city'],
                     item['county'],
                     item['name'],
                     item['descript'],
                     item['scenic_url'],
                     item['image_url'],
                     item['address'], '1'))
                # Commit the statement
                self.connect.commit()
        except Exception as error:
            # Log the error and keep going
            print(error)
        return item
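
The pipeline assumes that the edu_demo database already contains an a_scenic table. The article does not show its schema, but a table matching the columns used in the insert could be created with something like the following; the column types are my guesses, not taken from the original:

import pymysql

ddl = """
create table if not exists a_scenic (
    id          int auto_increment primary key,
    code        varchar(16),
    province    varchar(32),
    city        varchar(32),
    county      varchar(32),
    name        varchar(128),
    description varchar(512),
    ctrip_url   varchar(256),
    image_url   varchar(256),
    address     varchar(256),
    type        varchar(8)
) default charset=utf8
"""

conn = pymysql.connect(host='localhost', port=3306, db='edu_demo',
                       user='root', passwd='123456', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute(ddl)
conn.commit()
conn.close()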

8. start.py:

from scrapy import cmdline

cmdline.execute("scrapy crawl scenic".split())

9. Run start.py to start the crawl.
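
Equivalently, the crawl can be launched from the project root on the command line; the optional -o flag also exports the scraped items to a file:

scrapy crawl scenic -o scenic.json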
