思路
items编写需要爬取的数据
spider解析页面,返回items数据
pipelines如何存储数据
添加主函数实现pycharm调用Scrapy
修改机器人协议
进入Scrapy虚拟环境
conda activate Scrapy
进入要创建项目的目录(目录可以自己选择),再输入命令创建项目
scrapy startproject Meiju
进入Meiju目录,创建spider,spider名不能和项目名相同,网址会自动补全
cd Meiju
scrapy genspider Meijuspider meijutt.com
items.py
import scrapy
class MeijuItem(scrapy.Item):
    """Container for one scraped show entry."""

    # Title of the show.
    name = scrapy.Field()
    # Update status of the show (e.g. the latest episode label).
    state = scrapy.Field()
Meijuspider.py
# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
from Meiju.items import MeijuItem
# from .Meiju.pipelines import MeijuPipeline
class MeijuspiderSpider(scrapy.Spider):
    """Spider that scrapes the show list on the meijutt.com "new100" page.

    For every list entry it yields a ``MeijuItem`` holding the show name and
    its update state.
    """

    name = 'Meijuspider'
    allowed_domains = ['meijutt.com']
    start_urls = ['http://meijutt.com/new100.html']

    def parse(self, response):
        """Parse the list page and yield one ``MeijuItem`` per show.

        Entries that lack either the name link or the state node are skipped
        with a warning, so a single malformed ``<li>`` does not abort the
        whole page with an ``IndexError``.

        Args:
            response: The downloaded page; its raw body is decoded as GBK.

        Yields:
            MeijuItem: item with the ``name`` and ``state`` fields filled in.
        """
        content = etree.HTML(response.body.decode('GBK'))
        movies = content.xpath('//ul[@class="top-list fn-clear"]/li')
        for movie in movies:
            # Show name: text of the link inside the entry's <h5>.
            name_nodes = movie.xpath('./h5/a')
            # Update state: text of the <font> inside the state <span>.
            state_nodes = movie.xpath(
                './/span[@class="state1 new100state1"]/font')
            if not name_nodes or not state_nodes:
                self.logger.warning(
                    'Skipping list entry without name or state node')
                continue

            a = name_nodes[0].text
            stars = state_nodes[0].text

            item = MeijuItem()
            item['name'] = a
            item['state'] = stars
            print(a, '-----', stars)
            # Hand the item over to the configured item pipelines.
            yield item
pipelines.py
import json
class MeijuPipeline(object):
    """Item pipeline that stores every scraped item in ``meiju.json``.

    The file is opened once (UTF-8, truncated) when the pipeline is created,
    each item is written through that single handle as one JSON object per
    line, and the handle is closed when the spider finishes.
    """

    def __init__(self):
        # Start every run with an empty output file.
        self.file = open('meiju.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append ``item`` to the output file and pass it on unchanged.

        Args:
            item: The scraped item; it is converted with ``dict(item)``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object, so later pipelines can process it.
        """
        # Reuse the handle opened in __init__ instead of opening (and never
        # closing) a new one per item; ensure_ascii=False keeps non-ASCII
        # text readable in the file.
        json.dump(dict(item), self.file, ensure_ascii=False)
        # One object per line keeps the output parseable line by line.
        self.file.write('\n')
        return item

    def close_spider(self, spider=None):
        """Close the output file.

        ``spider`` defaults to ``None`` so the method can also be called
        without arguments, as the original misspelled method was.
        """
        self.file.close()

    # Backward-compatible alias for the original (misspelled) method name.
    claso_spider = close_spider
main.py 这个是自己在Meiju目录下创建的
from scrapy import cmdline

# Start the crawl only when this file is run as a script, so that merely
# importing the module does not launch a crawl.
if __name__ == '__main__':
    # Equivalent to typing "scrapy crawl Meijuspider" on the command line.
    cmdline.execute('scrapy crawl Meijuspider'.split())
settings.py 修改机器人协议和pipelines
# Disable robots.txt handling for this project.
ROBOTSTXT_OBEY = False

# Enabled item pipelines, keyed by import path; the integer is the value
# Scrapy uses to order pipelines.
ITEM_PIPELINES = {'Meiju.pipelines.MeijuPipeline': 300}
执行main.py 就能得到结果
meiju.json
{"name": "失联第二季", "state": "第1集"}{"name": "海军罪案调查处第十六季", "state": "第17集"}{"name": "麻烦一家人第一季", "state": "第12集"}{"name": "富家穷路第五季", "state": "第12集"}{"name": "金氏便利店第三季", "state": "第12集"}{"name": "联邦调查局第一季", "state": "第17集"}{"name": "菜鸟老警第一季", "state": "第17集"}{"name": "奇迹缔造者第一季", "state": "第7集"}{"name": "德里女孩第二季", "state": "第4集"}{"name": "现世末日第一季", "state": "第7集"}{"name": "紧急呼救第二季", "state": "第12集"}{"name": "东邻西舍第一季", "state": "第18集"}{"name": "宵禁第一季", "state": "第5集"}{"name": "格里斯城第一季", "state": "第8集"}{"name": "爆笑超市第四季", "state": "第12集"}{"name": "骑士陨落第二季", "state": "第1集"}{"name": "老爸有招第三季", "state": "第8集"}{"name": "与敌共谋第一季", "state": "第5集"}{"name": "天佑吾王第一季", "state": "第30集"}{"name": "弥补第一季", "state": "第2集"}{"name": "绿箭侠第七季", "state": "第17集"}{"name": "新圣女魔咒第一季", "state": "第14集"}{"name": "犯罪日记:暗杀科洛西奥第一季", "state": "第8集"}{"name": "伦敦生活第二季", "state": "第4集"}{"name": "暗影猎人第三季", "state": "第15集"}{"name": "穷山恶水第三季", "state": "第10集"}{"name": "豪门恩怨第二季", "state": "第14集"}{"name": "亡命天涯第一季", "state": "第7集"}{"name": "能源钱景第三季", "state": "第8集"}{"name": "街头法律第一季", "state": "第3集"}{"name": "公关第一季", "state": "第3集"}{"name": "罗斯威尔第一季", "state": "第9集"}{"name": "黑霹雳第二季", "state": "第16集"}{"name": "燃呀马德里第一季", "state": "第3集"}{"name": "亿万第四季", "state": "第2集"}{"name": "住院医师第二季", "state": "第18集"}{"name": "新版夏威夷神探第一季", "state": "第19集"}{"name": "庭审专家第三季", "state": "第18集"}{"name": "美国众神第二季", "state": "第3集"}{"name": "好女孩第二季", "state": "第4集"}{"name": "变身小姐", "state": "第16集"}{"name": "女子监狱比利时版第一季", "state": "第8集"}{"name": "叛国者第一季", "state": "第6集"}{"name": "美丽性世界第一季", "state": "第7集"}{"name": "致命标记第二季", "state": "第1集"}{"name": "霹雳娇娃第一季", "state": "第23集"}{"name": "码头第一季", "state": "第3集"}{"name": "巴普蒂斯特第一季", "state": "第6集"}{"name": "国务卿女士第五季", "state": "第17集"}{"name": "无辜第一季", "state": "第4集"}{"name": "大哥大姐没出息第一季", "state": "第9集"}{"name": "初来乍到第五季", "state": "第18集"}{"name": "德里罪案第一季", "state": "第7集"}{"name": "行尸走肉第九季", "state": "第15集"}{"name": "女超人第四季", "state": "第16集"}{"name": "绝境第二季", "state": "第9集"}{"name": 
"先见之明第二季", "state": "第8集"}{"name": "急速蕾恩第一季", "state": "第7集"}{"name": "探长薇拉第一季", "state": "本季终"}{"name": "探长薇拉第二季", "state": "本季终"}{"name": "法律与秩序特殊受害者第二十季", "state": "第17集"}{"name": "谜案追踪:阿德南事件", "state": "第3集"}{"name": "上帝加我好友第一季", "state": "第17集"}{"name": "开心汉堡店第九季", "state": "第18集"}{"name": "单亲辣妈第二季", "state": "第9集"}{"name": "海军罪案调查处:洛杉矶第十季", "state": "第18集"}{"name": "恶搞之家第十七季", "state": "第16集"}{"name": "辛普森一家第三十季", "state": "第18集"}{"name": "老顽童第一季", "state": "第17集"}{"name": "誓言第二季", "state": "第2集"}{"name": "黑色星期一第一季", "state": "第9集"}{"name": "宝贝老板:重围商界第二季", "state": "第6集"}{"name": "鲁保罗变装皇后秀第十一季", "state": "第4集"}{"name": "这个警察有点烦第六季", "state": "第11集"}{"name": "爱国者第二季", "state": "第5集"}{"name": "大城小妞第五季", "state": "第9集"}{"name": "实习医生格蕾第十五季", "state": "第18集"}{"name": "梅森探案集第一季", "state": "第21集"}{"name": "法律与秩序第十五季", "state": "第9集"}{"name": "女作家与谋杀案第七季", "state": "第2集"}{"name": "十全八美第一季", "state": "第16集"}{"name": "安迪·麦克第二季", "state": "第21集"}{"name": "分久再合第二季", "state": "第15集"}{"name": "华斯比历险记第一季", "state": "第25集"}{"name": "吸血鬼遗产第一季", "state": "第15集"}{"name": "音乐玩家第三季", "state": "第18集"}{"name": "罪恶黑名单第六季", "state": "第12集"}{"name": "无言有爱第三季", "state": "第19集"}{"name": "疯狂前女友第四季", "state": "第14集"}{"name": "布莱切利四人组之旧金山第一季", "state": "第8集"}{"name": "反击第七季", "state": "第9集"}{"name": "不对等的爱情第二季", "state": "第3集"}{"name": "无罪证明第一季", "state": "第6集"}{"name": "盲点第四季", "state": "第16集"}{"name": "末日巡逻队第一季", "state": "第6集"}{"name": "魔法师第四季", "state": "第9集"}{"name": "星球大战:抵抗组织第一季", "state": "第21集"}{"name": "傲骨之战第三季", "state": "第2集"}{"name": "天桥风云第十七季", "state": "第2集"}{"name": "恶行第一季", "state": "第2集"}