Crawler --- Crawling all 6 pages of comedy movies from the 80s site, part 2:

Below are the changes made in the generated Scrapy project:

In settings.py:
# 1. Disguise the crawler as a browser
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'

# 2. Comment out ROBOTSTXT_OBEY so robots.txt is not enforced
# ROBOTSTXT_OBEY = True

# 3. Enable the item pipeline (300 is the pipeline's priority, from 0 to 1000; lower values run first)
ITEM_PIPELINES = {
    'homework.pipelines.HomeworkPipeline': 300,
}
In E:\爬虫spiders\dianying2.py:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import HomeworkItem


class Dianying2Spider(CrawlSpider):
    name = 'dianying2'
    allowed_domains = ['www.80s.la']
    
    start_urls = ['http://www.80s.la/movie/list/2-----p1']
    # Each URL in start_urls is requested and returns a response; that response is
    # matched against the allow pattern in rules, the extracted URLs are requested
    # in turn, and each of those responses is passed to the callback method.
    rules = (
        Rule(LinkExtractor(allow=r'/movie/list/2-----p\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print("*" * 55)
        # Select all the li elements in the movie list
        li_list = response.xpath('//ul[@class="me1 clearfix"]/li')
        # Iterate over the li elements
        for li in li_list:
            # Extract the movie name
            movie_name = li.xpath('./h3/a/text()').extract_first().strip()
            # print(movie_name)
            # Extract the movie information
            movie_information = li.xpath('./span[@class="tip"]/text()').extract_first().strip()
            # print(movie_information)
            # Extract the movie's detail-page link
            movie_link = "http://www.80s.la" + li.xpath('./a/@href').extract_first()
            # print(movie_link)
            # Instantiate the HomeworkItem class
            item = HomeworkItem()
            # Assign the item's fields (an item behaves like a dict)
            item["movie_name"] = movie_name
            item["movie_information"] = movie_information
            item["movie_link"] = movie_link
            # Request the detail page and carry the item along via meta
            yield scrapy.Request(url=movie_link, callback=self.parse_item2, meta={"item2": item})

    def parse_item2(self, response):
        print("*" * 55)
        # print(response.meta["item2"])
        # Extract the movie's Thunder (Xunlei) download link
        movie_download_address = response.xpath('//ul[@class="dllist1"]/li[2]/span/span/a/@href').extract_first()
        # print(movie_download_address)
        # Retrieve the item that parse_item passed along via meta
        item = response.meta["item2"]
        # Assign the field (an item behaves like a dict)
        item["movie_download_address"] = movie_download_address
        # print(item)
        # Hand the item off to the pipeline
        yield item
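
The meta dict above works, but as an alternative sketch (not part of the original code), Scrapy 1.7+ also supports cb_kwargs, which passes the item to the callback as a named argument instead:

# Sketch only: the same hand-off using cb_kwargs instead of meta (Scrapy 1.7+)
yield scrapy.Request(url=movie_link, callback=self.parse_item2, cb_kwargs={"item": item})

def parse_item2(self, response, item):
    # the item arrives directly as a keyword argument
    item["movie_download_address"] = response.xpath('//ul[@class="dllist1"]/li[2]/span/span/a/@href').extract_first()
    yield item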

The spider for the whole project was created with: scrapy genspider -t crawl <name> <domain>; the concrete command is shown below.
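
With the spider name and domain used in the code above, the command would presumably have been:

scrapy genspider -t crawl dianying2 www.80s.la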

In items.py:
import scrapy

class HomeworkItem(scrapy.Item):
    # ---------------1. Save locally-----------------
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Movie name
    movie_name = scrapy.Field()
    # Movie information
    movie_information = scrapy.Field()
    # Movie detail-page link
    movie_link = scrapy.Field()
    # Download address
    movie_download_address = scrapy.Field()
In pipelines.py:
1. Save to a local text file:
class HomeworkPipeline(object):
    # Called when the spider is opened
    def open_spider(self, spider):
        # Open the output file for writing with utf-8 encoding
        self.file = open("dianying.txt", mode="w", encoding="utf-8")

    def process_item(self, item, spider):
        # Convert the item to a dict
        item_dict = dict(item)
        # Convert the dict to a string
        item_str = str(item_dict)
        # item_str = json.dumps(item_dict)
        # Write the string to the file
        self.file.write(item_str + "\n" + "\n" + "\n")
        # Return the item so any later pipelines can still receive it
        return item

    # Called when the spider is closed
    def close_spider(self, spider):
        # Close the file
        self.file.close()
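
The commented-out json.dumps line hints at JSON output; as a sketch (not part of the original project), the same pipeline could write one JSON object per line, with ensure_ascii=False so the Chinese movie names stay readable in the file:

import json


class HomeworkPipeline(object):
    def open_spider(self, spider):
        self.file = open("dianying.jsonl", mode="w", encoding="utf-8")

    def process_item(self, item, spider):
        # one JSON object per line; ensure_ascii=False keeps Chinese characters readable
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        self.file.close()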
2. Save to MySQL:
import pymysql


class HomeworkPipeline(object):
    # ---------------2. Connect to the database-----------------
    def __init__(self):
        # Connect to the MySQL database
        self.connect = pymysql.connect(host='47.98.254.167', user='shao', password='1111', db='movie', port=3306)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # Insert the row (parameterized query avoids quoting and injection problems)
        self.cursor.execute(
            'insert into 80movie(movie_name, movie_information, movie_link, movie_download_address) values (%s, %s, %s, %s)',
            (item['movie_name'], item['movie_information'], item['movie_link'], item['movie_download_address']))
        self.connect.commit()
        return item

    # Close the database connection
    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
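
The insert statement assumes the 80movie table already exists in the movie database; a minimal one-off creation sketch (the column types and lengths are assumptions, only the column names come from the insert above):

import pymysql

connect = pymysql.connect(host='47.98.254.167', user='shao', password='1111', db='movie', port=3306)
cursor = connect.cursor()
# column types are guesses; adjust them to the real data
cursor.execute(
    'create table if not exists 80movie('
    'id int primary key auto_increment, '
    'movie_name varchar(255), '
    'movie_information varchar(255), '
    'movie_link varchar(255), '
    'movie_download_address text)')
connect.commit()
cursor.close()
connect.close()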

3. Save to MongoDB:
# ---------------3. Connect to the MongoDB database-----------------
from pymongo import MongoClient


class HomeworkPipeline(object):
    def open_spider(self, spider):
        self.conn = MongoClient(host='127.0.0.1', port=27017)
        self.db = self.conn.movie

    def process_item(self, item, spider):
        self.collection = self.db['80movie2']
        # insert_one is the current pymongo API (the old insert() is deprecated)
        self.collection.insert_one(dict(item))
        return item
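
All three storage options above use the same class name, HomeworkPipeline, so only one can be enabled at a time. As a sketch (these class names are made up, not from the original project), they could instead be given distinct names and registered together in settings.py, where the numbers are priorities between 0 and 1000 and lower values run first:

ITEM_PIPELINES = {
    'homework.pipelines.TxtPipeline': 300,    # hypothetical names for the three
    'homework.pipelines.MysqlPipeline': 400,  # storage classes shown above
    'homework.pipelines.MongoPipeline': 500,
}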