Crawling all 6 pages of comedy movies on the 80s site (part 2):
Below are the changes made in the generated Scrapy project.
In settings.py:
# 1. Masquerade as a real browser
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
# 2. Comment out robots.txt compliance
# ROBOTSTXT_OBEY = True
# 3. Enable the item pipeline (300 is its priority: 0-1000, lower runs first)
ITEM_PIPELINES = {
    'homework.pipelines.HomeworkPipeline': 300,
}
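As a side note, if these overrides should apply only to this one spider rather than the whole project, Scrapy also accepts them as a custom_settings dict on the spider class; a minimal sketch mirroring the values above:

class Dianying2Spider(CrawlSpider):
    # Per-spider overrides; these take precedence over settings.py
    custom_settings = {
        'ROBOTSTXT_OBEY': False,
        'ITEM_PIPELINES': {'homework.pipelines.HomeworkPipeline': 300},
    }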
In E:\爬虫spiders\dianying2.py:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import HomeworkItem


class Dianying2Spider(CrawlSpider):
    name = 'dianying2'
    allowed_domains = ['www.80s.la']
    start_urls = ['http://www.80s.la/movie/list/2-----p1']
    # The responses for start_urls are matched against the allow patterns in
    # rules; each matching link is requested and handled by the named callback.
    rules = (
        Rule(LinkExtractor(allow=r'/movie/list/2-----p\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print("*" * 55)
        # Select every <li> in the movie list
        li_list = response.xpath('//ul[@class="me1 clearfix"]/li')
        for li in li_list:
            # Movie title (strip() already removes surrounding newlines)
            movie_name = li.xpath('./h3/a/text()').extract_first().strip()
            # Short blurb shown under the title
            movie_information = li.xpath('./span[@class="tip"]/text()').extract_first().strip()
            # Absolute link to the movie's detail page
            movie_link = "http://www.80s.la" + li.xpath('./a/@href').extract_first()
            # Instantiate HomeworkItem and fill its keys (items behave like dicts)
            item = HomeworkItem()
            item["movie_name"] = movie_name
            item["movie_information"] = movie_information
            item["movie_link"] = movie_link
            # Follow the detail page, carrying the item along via meta
            yield scrapy.Request(url=movie_link, callback=self.parse_item2, meta={"item2": item})

    def parse_item2(self, response):
        print("*" * 55)
        # Extract the Thunder (Xunlei) download link
        movie_download_address = response.xpath('//ul[@class="dllist1"]/li[2]/span/span/a/@href').extract_first()
        # Recover the item passed along from parse_item
        item = response.meta["item2"]
        item["movie_download_address"] = movie_download_address
        # Hand the completed item to the pipeline
        yield item
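A side note on the meta hand-off above: since Scrapy 1.7, cb_kwargs is the recommended way to pass data to a callback. An equivalent sketch of the two methods (an alternative, not the original code):

# In parse_item, pass the item as a keyword argument instead of via meta
yield scrapy.Request(url=movie_link, callback=self.parse_item2, cb_kwargs={"item": item})

# parse_item2 then receives it directly as a parameter
def parse_item2(self, response, item):
    item["movie_download_address"] = response.xpath('//ul[@class="dllist1"]/li[2]/span/span/a/@href').extract_first()
    yield item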
The whole project was scaffolded with scrapy genspider -t crawl <name> <url>.
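For this project, the concrete invocation would presumably have been:

scrapy genspider -t crawl dianying2 www.80s.la
# and to run the finished spider:
scrapy crawl dianying2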
In items.py:
import scrapy


class HomeworkItem(scrapy.Item):
    # ---------------1. Save locally-----------------
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Movie title
    movie_name = scrapy.Field()
    # Movie description
    movie_information = scrapy.Field()
    # Detail-page link
    movie_link = scrapy.Field()
    # Download address
    movie_download_address = scrapy.Field()
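As the spider's comments note, a scrapy.Item behaves like a dict, but only fields declared above are accepted; a quick illustration with made-up values:

item = HomeworkItem()
item['movie_name'] = 'some movie'   # OK: declared field
print(dict(item))                   # {'movie_name': 'some movie'}
item['rating'] = 9.0                # KeyError: 'rating' is not a declared field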
In pipelines.py:
1. Save to a local file:
class HomeworkPipeline(object):
    # Called once when the spider opens
    def open_spider(self, spider):
        # Open the output file for writing, UTF-8 encoded
        self.file = open("dianying.txt", mode="w", encoding="utf-8")

    def process_item(self, item, spider):
        # Convert the item to a dict, then to a string
        item_dict = dict(item)
        item_str = str(item_dict)
        # item_str = json.dumps(item_dict)
        # Write one record, separated by blank lines
        self.file.write(item_str + "\n" + "\n" + "\n")
        return item

    # Called once when the spider closes
    def close_spider(self, spider):
        self.file.close()
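The commented-out json.dumps line hints at a JSON variant. A minimal sketch of process_item writing one valid JSON object per line (assuming import json at the top of pipelines.py; ensure_ascii=False keeps Chinese titles readable):

import json

def process_item(self, item, spider):
    # One JSON object per line (JSON Lines format)
    self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
    return item

For quick experiments, Scrapy can also export items without any pipeline at all via scrapy crawl dianying2 -o dianying.json.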
2. Save to MySQL:
import pymysql


class HomeworkPipeline(object):
    # ---------------2. Connect to the database-----------------
    def __init__(self):
        # Connect to MySQL
        self.connect = pymysql.connect(host='47.98.254.167', user='shao', password='1111', db='movie', port=3306)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # Write one row; a parameterized query avoids quoting bugs and SQL injection
        self.cursor.execute(
            'insert into 80movie (movie_name, movie_information, movie_link, movie_download_address) values (%s, %s, %s, %s)',
            (item['movie_name'], item['movie_information'], item['movie_link'], item['movie_download_address']))
        self.connect.commit()
        return item

    # Close the database connection
    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
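The insert assumes a table named 80movie already exists in the movie database. A plausible way to create it from the same __init__, so the pipeline does not fail on first run (the column types and lengths here are assumptions, not taken from the original project):

# Assumed schema; adjust types and lengths as needed
self.cursor.execute(
    'create table if not exists `80movie` ('
    'movie_name varchar(255), movie_information varchar(255), '
    'movie_link varchar(255), movie_download_address text)')
self.connect.commit()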
3. Save to MongoDB:
from pymongo import MongoClient


class HomeworkPipeline(object):
    # ---------------3. Connect to MongoDB-----------------
    def open_spider(self, spider):
        self.conn = MongoClient(host='127.0.0.1', port=27017)
        self.db = self.conn.movie

    def process_item(self, item, spider):
        self.collection = self.db['80movie2']
        # insert_one() replaces the insert() method removed in recent pymongo
        self.collection.insert_one(dict(item))
        return item
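To verify the inserts, the collection can be read back with a few lines of pymongo (same host, database, and collection names as above):

from pymongo import MongoClient

conn = MongoClient(host='127.0.0.1', port=27017)
for doc in conn.movie['80movie2'].find().limit(3):
    print(doc.get('movie_name'), doc.get('movie_download_address'))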