An Example of Scraping Web Content with Scrapy in Python

This article shows how to develop a web crawler with Python's Scrapy framework, covering the Python installation, dependency problems, the spider code, and a pipeline that processes the scraped data. The example code scrapes emotional-life articles from nvsheng.com, including titles, body text, and images, and simulates uploading the images to a target API.

I spent last week learning Python and Scrapy, going from zero to a complete, working web crawler. The research was painful at times, but I enjoyed it; that's just what doing technical work is like.

First, installing Python. There are a lot of pitfalls, and I had to climb out of them one by one. Since I work on Windows (no budget for a Mac), I hit all sorts of problems during installation, mostly around the many dependencies.

I won't repeat the installation tutorial here. If you hit an ERROR about needing a Windows C/C++ compiler during installation, it usually means the Windows build environment is missing. Most tutorials online tell you to install Visual Studio for this, which is heavy-handed; installing the Windows SDK alone is enough.
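For reference, once the SDK is in place, Scrapy itself installs the usual way with pip (its lxml and Twisted dependencies are what need the C compiler):

pip install scrapy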

Here is my crawler code:

The spider:

# -*- coding: utf-8 -*-

import scrapy
from scrapy.http import Request
from zjf.FsmzItems import FsmzItem
from scrapy.selector import Selector

# 圈圈: emotional-life content
class MySpider(scrapy.Spider):
    # spider name
    name = "MySpider"
    # allowed domain
    allowed_domains = ["nvsheng.com"]
    # start URLs; filled in by __init__ from the command-line argument
    start_urls = []
    # flag: pagination links are only collected on the first response
    x = 0

    # parse callback
    def parse(self, response):
        item = FsmzItem()
        sel = Selector(response)
        item['title'] = sel.xpath('//h1/text()').extract()
        item['text'] = sel.xpath('//*[@class="content"]/p/text()').extract()
        item['imags'] = sel.xpath('//div[@id="content"]/p/a/img/@src|//div[@id="content"]/p/img/@src').extract()
        # on the first response only, queue the remaining pagination links
        if MySpider.x == 0:
            page_list = self.getUrl(response)
            for page_single in page_list:
                yield Request(page_single)
        MySpider.x += 1
        yield item

    # init: take the start URL as a dynamic argument
    # command-line usage: scrapy crawl MySpider -a start_url="http://some_url"
    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        # the -a start_url argument is required
        self.start_urls = [kwargs.get('start_url')]

    # collect pagination links (every link except the "next" button)
    def getUrl(self, response):
        url_list = []
        select = Selector(response)
        page_list_tmp = select.xpath('//div[@class="viewnewpages"]/a[not(@class="next")]/@href').extract()
        for page_tmp in page_list_tmp:
            if page_tmp not in url_list:
                url_list.append("http://www.nvsheng.com/emotion/px/" + page_tmp)
        return url_list
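A side note on the extraction code above: recent versions of Scrapy expose the selector directly on the response, so constructing a Selector by hand is optional and the same extraction can be written as:

item['title'] = response.xpath('//h1/text()').extract()
item['text'] = response.xpath('//*[@class="content"]/p/text()').extract()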

The pipeline class:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
import random
import urllib.request

import requests
from requests_toolbelt.multipart.encoder import MultipartEncoder

from zjf import settings

class MyPipeline(object):
    flag = 1                      # counts processed items; 1 means "first page"
    post_title = ''               # title taken from the first page
    post_text = []
    post_text_imageUrl_list = []
    cs = []                       # accumulated paragraphs for the final post
    user_id = ''

    def __init__(self):
        # pick a random user id to simulate posting as
        MyPipeline.user_id = MyPipeline.getRandomUser('37619,18441390,18441391')

    # process each scraped item
    def process_item(self, item, spider):
        # the randomly chosen user id used to simulate the post
        user_id = MyPipeline.user_id

        # join the body paragraphs into a single string
        text_str_tmp = ""
        for paragraph in item['text']:
            text_str_tmp = text_str_tmp + paragraph
        # print(text_str_tmp)

        # take the title from the first page only
        if MyPipeline.flag == 1:
            title = item['title']
            MyPipeline.post_title = MyPipeline.post_title + title[0]

        # download each image and upload it to the API
        # (note: only the last image's URL and size survive the loop)
        text_insert_pic = ''
        text_insert_pic_w = ''
        text_insert_pic_h = ''
        for imag_url in item['imags']:
            img_name = imag_url.replace('/', '').replace('.', '').replace('|', '').replace(':', '')
            pic_dir = settings.IMAGES_STORE + '%s.jpg' % (img_name)
            urllib.request.urlretrieve(imag_url, pic_dir)
            # upload the image; the API answers with JSON
            upload_img_result = MyPipeline.uploadImage(pic_dir, 'image/jpeg')
            # pull the stored image URL and dimensions out of the JSON
            text_insert_pic = upload_img_result['result']['image_url']
            text_insert_pic_w = upload_img_result['result']['w']
            text_insert_pic_h = upload_img_result['result']['h']

        # assemble this page's JSON fragment (the first page carries no image)
        if MyPipeline.flag == 1:
            cs_json = {"c": text_str_tmp, "i": "", "w": text_insert_pic_w, "h": text_insert_pic_h}
        else:
            cs_json = {"c": text_str_tmp, "i": text_insert_pic, "w": text_insert_pic_w, "h": text_insert_pic_h}
        MyPipeline.cs.append(cs_json)
        MyPipeline.flag += 1
        return item

    # called when the spider opens
    def open_spider(self, spider):
        pass

    # called when the spider closes: submit the assembled post
    def close_spider(self, spider):
        strcs = json.dumps(MyPipeline.cs)
        jsonData = {"apisign": "99ea3eda4b45549162c4a741d58baa60",
                    "user_id": MyPipeline.user_id,
                    "gid": 30,
                    "t": MyPipeline.post_title,
                    "cs": strcs}
        MyPipeline.uploadPost(jsonData)

    # upload one image as multipart form data (content_type is accepted but unused)
    @staticmethod
    def uploadImage(img_path, content_type):
        "uploadImage function"
        # UPLOAD_IMG_URL = "http://api.qa.douguo.net/robot/uploadpostimage"
        UPLOAD_IMG_URL = "http://api.douguo.net/robot/uploadpostimage"
        m = MultipartEncoder(
            fields={'user_id': MyPipeline.user_id,
                    'apisign': '99ea3eda4b45549162c4a741d58baa60',
                    'image': ('filename', open(img_path, 'rb'), 'image/jpeg')}
        )
        r = requests.post(UPLOAD_IMG_URL, data=m, headers={'Content-Type': m.content_type})
        return r.json()

    # submit the assembled post
    @staticmethod
    def uploadPost(jsonData):
        CREATE_POST_URL = "http://api.douguo.net/robot/uploadimagespost"  # the quotes were missing in the original
        reqPost = requests.post(CREATE_POST_URL, data=jsonData)

    # pick one user id at random from a comma-separated list
    @staticmethod
    def getRandomUser(userStr):
        return random.choice(userStr.split(','))
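As the boilerplate comment at the top of the pipeline says, the class has to be registered under ITEM_PIPELINES in settings.py, and this pipeline also reads settings.IMAGES_STORE for its download directory. A minimal sketch of the relevant settings, assuming the pipeline lives in a module importable as zjf.pipelines (the actual module name is not shown in the post, so adjust the dotted path to your project):

# settings.py -- the module path 'zjf.pipelines' is an assumption
ITEM_PIPELINES = {
    'zjf.pipelines.MyPipeline': 300,  # lower numbers run earlier in the chain
}
# directory the images are downloaded into; keep the trailing slash,
# since the pipeline builds paths as IMAGES_STORE + '%s.jpg'
IMAGES_STORE = 'D:/pics/'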

The Items class that holds the scraped fields:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class FsmzItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    #tutor = scrapy.Field()
    #strongText = scrapy.Field()
    text = scrapy.Field()
    imags = scrapy.Field()

Run this from the command line:

scrapy crawl MySpider -a start_url="http://www.aaa.com"

The spider will then crawl the content under www.aaa.com. Note that Scrapy needs a full URL including the scheme, so pass http://www.aaa.com rather than a bare domain.
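Independently of the pipeline, Scrapy's built-in feed export can also dump the scraped items to a file for inspection:

scrapy crawl MySpider -a start_url="http://www.aaa.com" -o items.json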

That's the whole example of scraping web content with Scrapy in Python. I hope it gives you a useful reference, and thanks for your support.

