scrapy 爬虫——百度贴吧
爬取了百度贴吧凡人修仙传的吧友留言。
settings.py
# settings.py — Scrapy project settings for the "stone" project.

# Fixed typo: SPIDER_MODULLES -> SPIDER_MODULES (Scrapy only reads the
# correctly-spelled key, so the misspelling silently breaks spider discovery).
SPIDER_MODULES = ['stone.spiders']
NEWSPIDER_MODULE = 'stone.spiders'
LOG_LEVEL = "WARNING"  # suppress log output below WARNING level
# Fixed: the original value accidentally embedded the text "USER_AGENT = '"
# inside the string itself, sending a malformed User-Agent header.
USER_AGENT = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
ROBOTSTXT_OBEY = False  # do not honour robots.txt
ITEM_PIPELINES = {
    'stone.pipelines.StonePipeline': 300,  # enable the MySQL storage pipeline
}
pipelines.py
import pymysql
class StonePipeline(object):
    """Item pipeline that persists scraped poster names and replies to MySQL.

    Expects each item to carry parallel lists under ``item['title']``
    (poster nicknames) and ``item['info']`` (reply texts).
    """

    def __init__(self):
        # Fixed: the original passed 3306 as the 4th *positional* argument,
        # which pymysql interprets as the database name, not the port.
        # TODO(review): confirm the actual database name — 'stone' assumed
        # from the project name; only the table name 'tieba' is visible here.
        self.conn = pymysql.connect(
            host='localhost',
            user='root',
            password='root',
            port=3306,
            database='stone',
            charset='utf8mb4',
        )
        self.cursor = self.conn.cursor()  # cursor used to execute SQL

    def process_item(self, item, spider):
        # zip() stops at the shorter list, so a title/info length mismatch
        # (e.g. a nickname split in two by an inline image — a failure mode
        # the author observed) can no longer raise IndexError the way the
        # original index-based loop could.
        for title, info in zip(item['title'], item['info']):
            # Fixed SQL keyword: VALUE -> VALUES. Parameters stay bound via
            # %s placeholders, so scraped text cannot inject SQL.
            self.cursor.execute(
                "insert into tieba(title, info) values (%s, %s);",
                (title, info),
            )
        self.conn.commit()
        # Pipelines must return the item so any later pipeline still sees it.
        return item

    def close_spider(self, spider):
        self.cursor.close()  # release the cursor
        self.conn.close()    # close the MySQL connection
item.py
import scrapy
class StoneItem(scrapy.Item):
    """Holds one page of scraped Tieba data as two parallel lists."""
    title = scrapy.Field() # poster nicknames scraped from the thread page
    info = scrapy.Field() # the corresponding reply/message texts
iyingdi.py
import scrapy
from ..items import StoneItem
class IyingdiSpider(scrapy.Spider):
    """Scrapes poster nicknames and reply texts from a Baidu Tieba thread,
    following the "next page" pager link until the last page."""
    name = 'iyingdi'
    allowed_domains = ['tieba.baidu.com']
    start_urls = ['https://tieba.baidu.com/p/7442853989']

    # Placeholder text Tieba shows for moderation-folded posts (note the
    # trailing non-breaking space); such entries are dropped.
    FOLDED_PLACEHOLDER = "该楼层疑似违规已被系统折叠\xa0"

    def parse(self, response):
        item = StoneItem()
        item['title'] = response.xpath(
            '//ul[@class="p_author"]/li[@class="d_name"]/a/text()'
        ).extract()
        # Fixed: the original called list.remove() while iterating the same
        # list, which skips the element immediately after each removal, so
        # consecutive folded posts were only half-filtered. Build a new
        # filtered list instead of mutating in place.
        item['info'] = [
            text
            for text in response.xpath('//cc/div/text()').extract()
            if text != self.FOLDED_PLACEHOLDER
        ]
        yield item

        base_url = "https://tieba.baidu.com"
        pager_links = response.xpath(
            '//li[@class="l_pager pager_theme_5 pb_list_pager"]/a'
        )
        for link in pager_links:
            if link.xpath('./text()').extract_first() == "下一页":
                next_href = link.xpath('./@href').extract_first()
                # dont_filter=True: pager hrefs repeat across pages, so let
                # Scrapy revisit them instead of deduplicating the request.
                yield scrapy.Request(
                    base_url + next_href,
                    callback=self.parse,
                    dont_filter=True,
                )
爬取结果
遇到的问题
- 爬取过程中不能识别特殊字符,导致后面的数据(本页)都不能继续爬取,只能跳过。
以及昵称中间位置有图片会将昵称分为两部分,导致数据部分列表下标溢出。
- 在使用 xpath 进行二次解析时,爬取到的数据一直为空。原因是写成了 url.xpath('/text()'),在 '/' 前忘记加 '.'('.' 表示从当前节点开始解析)。
- start_urls 一定要使用列表。