Environment: CentOS 6 + Python 3
Install: pip3 install scrapy
Error: src/twisted/test/raiser.c:4:20: error: Python.h: No such file or directory
src/twisted/test/raiser.c:6:6: error: #error Python headers needed to compile C extensions, please install development version of Python.
error: command 'gcc' failed with exit status 1
Fix: install the Python header files and development library package (python-devel)
yum search python3 | grep devel  # search for a python3 devel package
yum install -y python34-devel.x86_64  # install python34-devel.x86_64
pip3 install scrapy  # succeeds this time
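An optional sanity check that the install actually worked:
scrapy version
python3 -c "import scrapy; print(scrapy.__version__)"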
1. Create the project
cd /home/chaoge/mypython/crawler/
scrapy startproject myscrapy
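startproject generates the standard Scrapy skeleton; the files edited below live under the inner myscrapy/ package:
myscrapy/
    scrapy.cfg            # deploy configuration
    myscrapy/
        __init__.py
        items.py          # item models
        middlewares.py
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/
            __init__.py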
vi items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy

class MyscrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # position name
    positionName = scrapy.Field()
    # position link
    positionLink = scrapy.Field()
    # position type
    positionType = scrapy.Field()
    # number of openings
    peopleNum = scrapy.Field()
    # work location
    workLocation = scrapy.Field()
    # publish time
    publishTime = scrapy.Field()
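Items expose a dict-style API, which is what the spider below relies on; a quick illustration (the sample value is made up):
item = MyscrapyItem()
item['positionName'] = 'engineer'  # only declared fields are accepted
print(dict(item))                  # {'positionName': 'engineer'}
item['salary'] = '20k'             # KeyError: MyscrapyItem does not support field: salary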
vi pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json

class MyscrapyPipeline(object):
    def __init__(self):
        self.filename = open("tencent.json", "wb")

    def process_item(self, item, spider):
        # serialize each item as one JSON line
        text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(text.encode("utf-8"))
        return item

    def close_spider(self, spider):
        self.filename.close()
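Opening the file in __init__ works here, but Scrapy also provides an open_spider hook symmetric with close_spider; a variant sketch:
class MyscrapyPipeline(object):
    def open_spider(self, spider):
        # called once when the spider starts
        self.filename = open("tencent.json", "wb")

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(text.encode("utf-8"))
        return item

    def close_spider(self, spider):
        # called once when the spider finishes
        self.filename.close()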
vi settings.py
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'myscrapy.middlewares.MyscrapySpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'myscrapy.middlewares.MyscrapyDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'myscrapy.pipelines.MyscrapyPipeline': 300,
}
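One more setting worth checking: project templates generated by Scrapy 1.1+ ship with ROBOTSTXT_OBEY = True, which can silently filter requests; for this exercise it is usually disabled:
ROBOTSTXT_OBEY = False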
2. Generate the spider class
cd myscrapy/myscrapy/spiders
scrapy genspider tencent "tencent.com"
vi tencent.py
# -*- coding: utf-8 -*-
import scrapy
from myscrapy.items import MyscrapyItem

class TencentSpider(scrapy.Spider):
    name = 'tencent'
    allowed_domains = ['tencent.com']
    url = "http://hr.tencent.com/position.php?&start="
    offset = 0
    #start_urls = ['http://tencent.com/']
    start_urls = [url + str(offset)]

    def parse(self, response):
        for each in response.xpath("//tr[@class='even']|//tr[@class='odd']"):
            # initialize the item object
            item = MyscrapyItem()
            # position name
            item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
            # position link
            item['positionLink'] = each.xpath("./td[1]/a/@href").extract()[0]
            # position type
            item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
            # number of openings
            item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
            # work location
            item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
            # publish time
            item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]
            # hand the item to the pipeline
            yield item
        if self.offset < 50:
            self.offset += 10
            # send a new request to the scheduler: enqueued, dequeued, then handed to the downloader
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
        else:
            print("end.")
3. Run the spider
scrapy crawl tencent
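Each item is written as one JSON line to tencent.json by the pipeline above, so the result can be spot-checked with:
head -2 tencent.json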
Simulating login
# -*- coding: utf-8 -*-
import scrapy

class RenrenspiderSpider(scrapy.Spider):
    name = 'renrenspider'
    allowed_domains = ['renren.com']
    #start_urls = ['http://renren.com/']

    def start_requests(self):
        # POST the credentials straight to the login endpoint
        url = 'http://www.renren.com/PLogin.do'
        yield scrapy.FormRequest(url=url, formdata={"email": "XXXX@163.com", "password": "XXXXXX"}, callback=self.parse_page)

    def parse_page(self, response):
        # save the post-login page for inspection
        with open("info.html", "wb") as filename:
            filename.write(response.body)
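When a login form carries hidden fields (CSRF tokens and the like), FormRequest.from_response can copy them from a fetched login page and merge in the credentials; a sketch (parse_login_page is a hypothetical callback, not part of the spider above):
def parse_login_page(self, response):
    # hidden <input> values are taken from the page; formdata overrides them
    yield scrapy.FormRequest.from_response(
        response,
        formdata={"email": "XXXX@163.com", "password": "XXXXXX"},
        callback=self.parse_page,
    )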
Downloading images:
Runtime error: File "/usr/lib64/python3.4/site-packages/scrapy/pipelines/images.py", line 15, in <module>
from PIL import Image
ImportError: No module named 'PIL'
Fix: pip3 install pillow
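A one-liner to confirm the import the error complained about now resolves:
python3 -c "from PIL import Image"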
vi items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy

class DouyuItem(scrapy.Item):
    # define the fields for your item here like:
    nickname = scrapy.Field()
    imagelink = scrapy.Field()
    imagepath = scrapy.Field()
vi pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
import scrapy
from scrapy.utils.project import get_project_settings
from scrapy.pipelines.images import ImagesPipeline

class DouyuPipeline(ImagesPipeline):
    # fetch the constant from the project settings
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        # schedule the avatar download
        image_url = item['imagelink']
        yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # results is a list of (success, file_info) tuples
        print(results)  # debug: inspect the download results
        image_path = [x['path'] for ok, x in results if ok]
        if not image_path:
            # download failed, nothing to rename
            return item
        # rename the downloaded file after the streamer's nickname
        os.rename(self.IMAGES_STORE + "/" + image_path[0],
                  self.IMAGES_STORE + "/" + item['nickname'] + ".jpg")
        item['imagepath'] = self.IMAGES_STORE + "/" + item['nickname'] + ".jpg"
        return item
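This pipeline assumes IMAGES_STORE is defined in settings.py and that the pipeline itself is enabled there; a sketch (the storage path is an assumption, adjust to taste):
IMAGES_STORE = "/home/chaoge/mypython/crawler/douyu/images"  # assumed path
ITEM_PIPELINES = {
    'douyu.pipelines.DouyuPipeline': 300,
}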
vi spiders/douyuavatar.py
# -*- coding: utf-8 -*-
import scrapy
from douyu.items import DouyuItem
import json

class DouyuavatarSpider(scrapy.Spider):
    name = 'douyuavatar'
    allowed_domains = ['capi.douyucdn.cn']
    url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    offset = 0
    start_urls = [url + str(offset)]

    def parse(self, response):
        # convert the JSON payload into Python objects
        data = json.loads(response.text)['data']
        #print(data)
        #exit(1)
        for value in data:
            item = DouyuItem()
            item['nickname'] = value['nickname']
            item['imagelink'] = value['vertical_src']
            yield item
        # fetch the next page until offset reaches 50
        if self.offset < 50:
            self.offset += 20
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
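Run it the same way as before, using the name declared in the class:
scrapy crawl douyuavatar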