本次的课程是学习scrapy 架构进行网页的爬取,之前也有从网络上的视频预览过scrapy的库,不过当时没有了解这么深入。 只知道可以用scrapy shell 快速调试爬取的内容是否正确,但并没有理解scrapy 的框架。
通过这次的直播课,我对 scrapy 框架有了进一步的了解,明白它由不同的组件分别负责数据的爬取、输出等功能。
不过因为框架编程对我这样的小白来说还是很新的知识,只能照葫芦画瓢,没法全面认识它的各项功能。
main 文件:
#!/usr/bin/python
# encoding: utf-8
"""
@author: Samson
@file: main.py
@time: 2019/10/20 21:56

Entry point for running the project from an IDE: invokes Scrapy's CLI
machinery as if `scrapy crawl baidu` were typed on the command line.
"""
import os
import sys

from scrapy.cmdline import execute

# Make the project root importable no matter what the current working
# directory is when this script is launched.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

if __name__ == "__main__":
    # "baidu" is the spider's `name` attribute defined in spiders/baidu.py.
    # Guarded so merely importing this module does not start a crawl.
    execute(["scrapy", "crawl", "baidu"])
items 文件:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class BaiduTiebaItem(scrapy.Item):
    """Empty placeholder generated by `scrapy startproject`.

    Unused in practice — the spider yields plain dicts shaped like
    TiebaItem below rather than instances of this class.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
class TiebaItem(scrapy.Item):
    """One reply in a Tieba thread.

    Field names must match the keys used by the spider and the column
    order written by BaiduTiebaPipeline.
    """
    title = scrapy.Field()    # thread title (same for every reply in a thread)
    author = scrapy.Field()   # reply author's display name
    post = scrapy.Field()     # reply body text
    level = scrapy.Field()    # floor number, e.g. "2楼"
    postime = scrapy.Field()  # post timestamp text (sic: "postime", one 't')
settings 文件:
# Scrapy project settings — only the values changed from the generated defaults.
BOT_NAME = 'baidu_tieba'
# Where Scrapy discovers spider classes and where `genspider` puts new ones.
SPIDER_MODULES = ['baidu_tieba.spiders']
NEWSPIDER_MODULE = 'baidu_tieba.spiders'
# Honor robots.txt before fetching (enabled by the default project template).
ROBOTSTXT_OBEY = True
# Route every scraped item through the Excel-writing pipeline.
# The number is the pipeline's order (lower runs earlier), not a priority weight.
ITEM_PIPELINES = {
    'baidu_tieba.pipelines.BaiduTiebaPipeline': 1,
}
pipelines 文件:
from openpyxl import Workbook
class BaiduTiebaPipeline(object):
    """Collect scraped Tieba items into an in-memory Excel workbook and
    write it to SpiderResult.xlsx when the spider closes."""

    # Column order for the sheet; must match the keys of the yielded items.
    FIELDS = ["title", "author", "post", "level", "postime"]

    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(self.FIELDS)  # header row

    def process_item(self, item, spider):
        """Append one item as a worksheet row.

        Returns the item unchanged so any later pipelines still see it.
        """
        self.ws.append([item[field] for field in self.FIELDS])
        return item

    def close_spider(self, spider):
        """Persist the workbook once at shutdown.

        The original saved inside process_item, rewriting the entire
        .xlsx file for every single item (O(n) full-file writes).
        """
        self.wb.save('SpiderResult.xlsx')
baidu 文件:
# -*- coding: utf-8 -*-
import scrapy
from urllib import parse
class BaiduSpider(scrapy.Spider):
    """Crawl the featured ("good") posts of a Baidu Tieba forum and yield
    one dict per reply with title/author/post/level/postime keys."""
    name = 'baidu'
    # Domain only (no scheme or path) for the offsite filter.
    allowed_domains = ['tieba.baidu.com']
    # kw= is a GBK percent-encoded keyword; no trailing slash.
    start_urls = ['https://tieba.baidu.com/f/good?kw=%B7%C0%D5%A9%C6%AD']

    def parse(self, response):
        """Forum list page: follow every thread link to its detail page."""
        url_list = response.css(".j_th_tit::attr(href)").extract()
        for url in url_list:
            # Thread hrefs are relative ("/p/..."); resolve against the page URL.
            yield scrapy.Request(url=parse.urljoin(response.url, url),
                                 callback=self.parse_detail)

    @staticmethod
    def _split_tail_info(texts):
        """Split the ".tail-info" text nodes into (floors, times).

        The nodes interleave floor ("N楼") and post-time strings, with
        stray "来自" (posted-from) markers mixed in.  Filter the markers
        out, then split by position parity.  The original removed items
        from the list while iterating it, which silently skips the
        second of two consecutive "来自" entries.
        """
        cleaned = [t for t in texts if t != "来自"]
        return cleaned[0::2], cleaned[1::2]

    def parse_detail(self, response):
        """Thread detail page: emit one item per reply."""
        # Thread title; empty on pages that are not real thread pages.
        title = response.css(".core_title_txt.pull-left.text-overflow ::text").extract()
        if not title:
            return
        # Reply bodies and their authors (parallel lists by page order).
        post_list = response.css(".d_post_content.j_d_post_content::text").extract()
        author_list = response.css(".p_author_name.j_user_card::text").extract()
        # Extract .tail-info once (the original queried it twice).
        level_list, postime_list = self._split_tail_info(
            response.css(".tail-info::text").extract())
        # zip stops at the shortest list, so a missing tail-info entry can
        # no longer raise IndexError.  Build a NEW dict per reply — the
        # original mutated and re-yielded one shared dict, so every item
        # was the same object.
        for author, post, level, postime in zip(author_list, post_list,
                                                level_list, postime_list):
            yield {
                "title": title[0],
                "author": author,
                "post": post,
                "level": level,
                "postime": postime,
            }