【Blogger's Notes】
While learning Scrapy I followed another blogger's hands-on tutorial but ran into a few problems. I modified the code myself and the issues are now resolved; if you hit the same common problems, this may serve as a reference.
Original tutorial: https://blog.csdn.net/pengjunlee/article/details/89500302. Without further ado, see the original post for the details; here I simply post the code (the directory structure is the same as in the original).
【Code】
1. Define items.py
import scrapy
class BaiduTiebaItem(scrapy.Item):
    title = scrapy.Field()        # post title
    url = scrapy.Field()          # post URL
    content = scrapy.Field()      # post content
    post_date = scrapy.Field()    # posting date
    reply_count = scrapy.Field()  # number of replies
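Scrapy Items behave like dictionaries, which is what both the spider and the pipeline below rely on. A tiny sketch of how a BaiduTiebaItem is filled in and converted (the values here are made-up placeholders, not real crawl data):
from baidu_tieba.items import BaiduTiebaItem
item = BaiduTiebaItem()
item['title'] = 'example title'                     # placeholder value
item['url'] = 'https://tieba.baidu.com/p/0000000'   # hypothetical URL, for illustration only
item['reply_count'] = '0'
print(dict(item))  # a plain dict, exactly what json.dump() receives in the pipeline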
2. Create the spider: tieba.py
# -*- coding: utf-8 -*-
import scrapy
import re
# from lxml import etree
from lxml import html
from baidu_tieba.items import BaiduTiebaItem
etree = html.etree
class TiebaSpider(scrapy.Spider):  # Tieba spider, inherits from scrapy.Spider
    name = 'tieba'  # spider name; start the crawl with: scrapy crawl <spider name>
    allowed_domains = ['tieba.baidu.com']  # domains the spider is allowed to crawl
    start_urls = ['https://tieba.baidu.com/f?kw=%E5%85%A8%E6%A3%89%E6%97%B6%E4%BB%A3']  # start URL of the crawl
    def parse(self, response):  # extract data from the thread-list page
        # The thread list is currently wrapped inside an HTML comment, so it cannot be
        # reached with XPath directly; extract it with a regular expression first.
        pattern = re.compile(
            '<!--\n\n(<ul id="thread_list" class="threadlist_bright j_threadlist_bright">.*?)--></code>', re.S)
        html_str = pattern.search(response.body.decode())
        # Turn the extracted string into an lxml Element
        html = etree.HTML(html_str.group(1))
        # print(etree.tostring(html.xpath("//li")[0]))
        # Select all thread <li> elements, excluding ads
        li_list = html.xpath(
            "//ul[@id='thread_list']//li[@class=' j_thread_list clearfix']|//ul[@id='thread_list']//li[@class='thread_top_list_folder']")
        for li in li_list:
            item = BaiduTiebaItem()
            # Use relative XPath (".//") so each value comes from the current <li>,
            # not from the first match in the whole document.
            item['reply_count'] = li.xpath(".//span[@title='回复']/text()")[0]
            item['title'] = li.xpath(".//a[@class='j_th_tit ']/@title")[0]
            item['url'] = li.xpath(".//a[@class='j_th_tit ']/@href")[0]
            item['url'] = 'https://tieba.baidu.com' + item['url']
            item['content'] = li.xpath(".//div[@class='threadlist_abs threadlist_abs_onlyline ']/text()")[0]
            item['post_date'] = li.xpath(".//span[@class='pull-right is_show_create_time']/text()")[0]
            yield item
        # Get the URL of the next thread-list page
        next_url = html.xpath("./node()//a[@class='next pagination-item ']/@href")[0] if len(
            html.xpath("./node()//a[@class='next pagination-item ']/@href")) > 0 else None
        if next_url is not None:
            # Issue a new request for the next page of the thread list
            yield scrapy.Request('https:' + next_url, callback=self.parse)
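A note on the regex step in parse(): Tieba hides the thread list inside an HTML comment (itself sitting inside a <code> element, which is why the regex ends with --></code>), so XPath cannot see the <ul> on the raw response; the comment has to be cut out with a regex first and re-parsed with lxml. A self-contained sketch of the same idea on a made-up page fragment, just to show why the two-step extraction is needed:
import re
from lxml import html
etree = html.etree
# Made-up fragment that mimics the structure the spider deals with:
# the <ul> is hidden inside an HTML comment within a <code> element.
page = '<code><!--\n\n<ul id="thread_list" class="threadlist_bright j_threadlist_bright"><li>demo</li></ul>\n--></code>'
pattern = re.compile('<!--\n\n(<ul id="thread_list" class="threadlist_bright j_threadlist_bright">.*?)--></code>', re.S)
hidden = pattern.search(page).group(1)  # pull the commented-out markup back out
tree = etree.HTML(hidden)               # now XPath can reach it
print(tree.xpath("//ul[@id='thread_list']/li/text()"))  # ['demo']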
3. Write the pipeline: pipelines.py
# -*- coding: utf-8 -*-
import json
from baidu_tieba.items import BaiduTiebaItem
class BaiduTiebaPipeline(object):  # pipeline that handles Baidu Tieba items
    def process_item(self, item, spider):
        if isinstance(item, BaiduTiebaItem):  # only handle BaiduTiebaItem; other items pass through untouched
            item['content'] = item['content'].strip()  # strip leading/trailing whitespace from the post content
            # Append the post to a file
            with open('quanmian.txt', 'a', encoding='utf-8') as f:
                json.dump(dict(item), f, ensure_ascii=False, indent=2)
        return item
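Reopening quanmian.txt for every single item works, but for larger crawls a common variant (not what the original post does, just a sketch) is to open the file once in open_spider() and write one JSON object per line:
# Variant sketch, not part of the original tutorial: JSON Lines output with a single file handle.
import json
from baidu_tieba.items import BaiduTiebaItem
class BaiduTiebaJsonLinesPipeline(object):
    def open_spider(self, spider):
        self.file = open('quanmian.jl', 'a', encoding='utf-8')  # hypothetical output file name
    def close_spider(self, spider):
        self.file.close()
    def process_item(self, item, spider):
        if isinstance(item, BaiduTiebaItem):
            self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item
If you use this variant instead, remember to point ITEM_PIPELINES in settings.py at it.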
4. settings.py
After defining BaiduTiebaPipeline, you still need to enable it in settings.py for it to take effect.
# -*- coding: utf-8 -*-
# Scrapy settings for baidu_tieba project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'baidu_tieba'
SPIDER_MODULES = ['baidu_tieba.spiders']
NEWSPIDER_MODULE = 'baidu_tieba.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 0
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'baidu_tieba.middlewares.BaiduTiebaSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'scrapy_splash.SplashCookiesMiddleware': 723,
# 'scrapy_splash.SplashMiddleware': 725,
# 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
# }
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'baidu_tieba.pipelines.BaiduTiebaPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
LOG_LEVEL = 'WARNING'
# SPLASH_URL = 'http://localhost:8050'
# DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
5. scrapy.cfg
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = baidu_tieba.settings
[deploy]
#url = http://localhost:6800/
project = baidu_tieba
6. If you want to run the spider from PyCharm, create a start.py under the baidu_tieba/baidu_tieba/ directory:
from scrapy import cmdline
cmdline.execute('scrapy crawl tieba'.split())
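If you prefer not to go through cmdline, Scrapy can also be started programmatically; a sketch of the equivalent using CrawlerProcess (run it from inside the project so that scrapy.cfg and settings.py can be found):
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
process = CrawlerProcess(get_project_settings())  # load this project's settings.py
process.crawl('tieba')  # the spider name defined on TiebaSpider
process.start()         # blocks until the crawl finishes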
That's it. Fire up the spider and start crawling!