Week 4 Crawler Progress Report
Preface
This week we crawled 今日头条 (Toutiao) and 网易新闻 (NetEase News), optimized how the scraped data is stored, and standardized the crawler file names so that later modules can reference them consistently.
I. Crawling 今日头条 (Toutiao)
The page structure of 今日头条 differs from the other news sites we have handled: its results are loaded dynamically only as the scrollbar is dragged toward the bottom of the page. So before grabbing the page source for parsing, we have to scroll down to the bottom several times until everything we need has been rendered. We crawl 今日头条 with Scrapy combined with Selenium, which also keeps the structure of this crawler module consistent with the others.
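Before wiring Selenium into the Scrapy middleware, the scroll-and-wait behaviour can be checked on its own. Below is a minimal stand-alone sketch (not part of the project code) that assumes chromedriver is available on the PATH; instead of a fixed number of scrolls it stops once the page height no longer grows:

from time import sleep
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.toutiao.com/search/?keyword=%E5%B1%B1%E4%B8%9C%E5%A4%A7%E5%AD%A6')

last_height = browser.execute_script('return document.body.scrollHeight')
while True:
    # scroll to the bottom and give the page time to append more results
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    sleep(2)
    new_height = browser.execute_script('return document.body.scrollHeight')
    if new_height == last_height:  # nothing new was loaded
        break
    last_height = new_height

html = browser.page_source  # fully rendered source, ready for parsing
browser.quit()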
1. Writing items.py
import scrapy


class SpiderHeadlineItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    source = scrapy.Field()
    timestamp = scrapy.Field()
    detail = scrapy.Field()
2. Enable the downloader middleware in settings.py
DOWNLOADER_MIDDLEWARES = {
    'Spider_headline.middlewares.SpiderHeadlineDownloaderMiddleware': 543,
}
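One related setting worth noting: the settings.py generated by scrapy startproject enables ROBOTSTXT_OBEY = True, which can filter requests before they ever reach the middleware. Whether that actually happens for these sites is an assumption on my part, but if requests are being dropped it can be switched off in the same settings.py:

# only needed if the target site's robots.txt filters our requests
ROBOTSTXT_OBEY = False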
3. Edit middlewares.py (scroll the page down before returning the rendered source)
# imports needed at the top of middlewares.py; process_request below is a
# method of SpiderHeadlineDownloaderMiddleware
from time import sleep
from scrapy.http import HtmlResponse
from selenium.common.exceptions import TimeoutException

    def process_request(self, request, spider):
        if spider.middle_control == '一级':  # '一级' = first-level (listing) page that needs scrolling
            try:
                spider.browser.get(request.url)
                # drag the scrollbar so the page loads the elements at the bottom
                for i in range(9):
                    spider.browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                    sleep(2)
            except TimeoutException as e:
                print('timed out')
                spider.browser.execute_script('window.stop()')
            return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source,
                                encoding="utf-8", request=request)
        else:
            try:
                spider.browser.get(request.url)
            except TimeoutException as e:
                print('timed out')
                spider.browser.execute_script('window.stop()')
            return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source,
                                encoding="utf-8", request=request)
4. Writing myspider.py
import scrapy
from selenium import webdriver
from ..items import SpiderHeadlineItem
import os
import pandas as pd


class MyspiderSpider(scrapy.Spider):
    name = 'myspider'
    middle_control = '一级'  # '一级' = listing page (scroll in the middleware), '二级' = detail page
    stop_words = []
    # location of the stop-word file (assumed to be a single line of '.'-separated words)
    store_file = os.path.dirname(__file__) + '/IrrelevantWords.txt'
    data = pd.read_csv(store_file, header=None, sep=".")
    for i in range(data.shape[1]):
        if data[i][0] not in stop_words:
            stop_words.append(data[i][0])

    def __init__(self):
        # configure Chrome not to load images
        chrome_options = webdriver.ChromeOptions()
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        self.browser = webdriver.Chrome(options=chrome_options)
        self.browser.set_page_load_timeout(30)

    def closed(self, spider):
        print("spider closed")
        self.browser.close()

    def start_requests(self):
        # search keyword: 山东大学 (URL-encoded)
        url = 'https://www.toutiao.com/search/?keyword=%E5%B1%B1%E4%B8%9C%E5%A4%A7%E5%AD%A6'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        title_list = []
        for each in response.xpath("//div[@class='sections']//div[@class='articleCard']"):
            title = each.xpath("div[@class='item']//span[@class='J_title']//text()").extract()
            whole_title = ''
            for i in title:
                whole_title = whole_title + i.strip(' ')
            title_list.append(whole_title)
        ex_href = 'https://www.toutiao.com'
        url_list = response.xpath("//div[@class='sections']//div[@class='articleCard']//a[@class='link title']/@href").extract()
        source_list = response.xpath(
            "//div[@class='sections']//div[@class='articleCard']//div[@class='y-box footer']//a[@class='lbtn source J_source']//text()").extract()
        timestamp_list = response.xpath(
            "//div[@class='sections']//div[@class='articleCard']//div[@class='y-box footer']//span[@class='lbtn']//text()").extract()
        for i in range(len(title_list)):
            self.middle_control = '二级'  # later requests are detail pages: no scrolling needed
            title = title_list[i]
            url = ex_href + url_list[i]
            source = source_list[i]
            timestamp = timestamp_list[i]
            detail = ''
            item = SpiderHeadlineItem(title=title, url=url, source=source, timestamp=timestamp, detail=detail)
            # yield scrapy.Request(url=url, meta={'item': item}, callback=self.parse_detail)
            yield item
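The commented-out request above points at a parse_detail callback that is not shown in this week's code. Purely as a hypothetical sketch (the XPath is a guess, not the real 今日头条 article structure), such a method inside MyspiderSpider could fill the detail field with the visible paragraph text:

    def parse_detail(self, response):
        # hypothetical sketch: the real article markup may differ
        item = response.meta['item']
        paragraphs = response.xpath('//p//text()').extract()
        item['detail'] = ''.join(p.strip() for p in paragraphs)
        yield item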
II. Crawling 网易新闻 (NetEase News)
The crawling steps for 网易新闻 are basically the same as for the news pages covered earlier.
1. items.py
import scrapy


class SpiderWangyiItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    source = scrapy.Field()
    timestamp = scrapy.Field()
    detail = scrapy.Field()
2. Writing myspider.py
import scrapy
import pandas as pd
from ..items import SpiderWangyiItem
import os


class MyspiderSpider(scrapy.Spider):
    name = 'myspider'
    # allowed_domains = ['163.com']
    # Bing site search restricted to news.163.com; the keyword is 山东大学 (URL-encoded)
    start_urls = ['https://cn.bing.com/search?q=%e5%b1%b1%e4%b8%9c%e5%a4%a7%e5%ad%a6+site%3anews.163.com&first=1&FORM=PERE1']
    url = 'https://cn.bing.com/search?q=%e5%b1%b1%e4%b8%9c%e5%a4%a7%e5%ad%a6+site%3anews.163.com&first={}&FORM=PERE1'
    page = 1
    stop_words = []
    # location of the stop-word file (assumed to be a single line of '.'-separated words)
    store_file = os.path.dirname(__file__) + '/IrrelevantWords.txt'
    data = pd.read_csv(store_file, header=None, sep=".")
    for i in range(data.shape[1]):
        if data[i][0] not in stop_words:
            stop_words.append(data[i][0])

    def parse(self, response):
        # collect the titles
        title_list = []
        for each in response.xpath("//li[@class='b_algo']//h2"):
            title = each.xpath("a//text()").extract()
            whole_title = ''
            for i in title:
                whole_title = whole_title + i.strip(' ')
            title_list.append(whole_title)
        # collect the url that goes with each title
        url_list = response.xpath("//li[@class='b_algo']//h2//a/@href").extract()
        # news source: 网易新闻
        source = '网易新闻'
        # collect each result's timestamp/summary text
        timestamp_list = []  # response.xpath("//li[@class='b_algo']//div[@class='b_caption']//p//text()").extract()
        for each in response.xpath("//li[@class='b_algo']//div[@class='b_caption']"):
            timestamp = each.xpath("p//text()").extract()
            whole_timestamp = ''
            for i in timestamp:
                whole_timestamp = whole_timestamp + i.strip(' ')
            timestamp_list.append(whole_timestamp)
        for i in range(len(timestamp_list)):
            temp_title = title_list[i]
            temp_time = timestamp_list[i]
            if '山东大学' in temp_title or '山大' in temp_title or '山东大学' in temp_time or '山大' in temp_time:
                title = temp_title
                url = url_list[i]
                source = source
                timestamp = temp_time
                item = SpiderWangyiItem(title=title, url=url, source=source, timestamp=timestamp)
                yield scrapy.Request(url=url, meta={'item': item}, callback=self.parse_detail)
                print('second-level page crawled')
        print('first-level page {} crawled'.format(self.page))
        # page through the Bing results via the "first" query parameter
        if self.page < 686:
            if self.page == 1:
                url = self.url.format(6)
                self.page = 6
                yield scrapy.Request(url=url, callback=self.parse)
            else:
                self.page += 10
                url = self.url.format(self.page)
                yield scrapy.Request(url=url, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta['item']
        total_message = ''
        num = 0
        num_list = []
        # first pass: remember the positions of links whose href is not an obvious external target
        for each in response.xpath("//a"):
            message = each.xpath("@href").extract()
            if len(message) != 0 and message[0] != 'http://www.qq.com' and message[0] != 'https://www.baidu.com':
                num_list.append(num)
            num += 1
        num = 0
        # second pass: keep the anchor text at those positions, filtered by the stop-word list
        for each in response.xpath("//a"):
            message = each.xpath("text()").extract()
            if num in num_list:
                if len(message) != 0 and message[0].replace(' ', '') not in self.stop_words:
                    total_message = total_message + message[0].replace(' ', '') + '.'
            num += 1
        item['detail'] = total_message
        yield item
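To make the two-pass filtering in parse_detail a bit more concrete, here is a tiny stand-alone illustration of the same idea; the sample anchor texts and stop words are made up purely for the example:

# made-up data, only to illustrate the filtering step in parse_detail
stop_words = ['登录', '客户端']
anchor_texts = ['登录', '山东大学新增两名院士', '客户端', '山大举办校园开放日']

detail = ''
for text in anchor_texts:
    cleaned = text.replace(' ', '')
    if cleaned not in stop_words:
        detail = detail + cleaned + '.'

print(detail)  # -> 山东大学新增两名院士.山大举办校园开放日.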
III. Optimizing the Crawler's Data Storage
When crawling the earlier news sites, we stored the data by running the crawl from the terminal with a command such as scrapy crawl itcast -o teachers.csv. Now we store the data through pipelines.py instead, which simplifies the operation and makes it convenient to process each item individually.
1. Configure settings.py
ITEM_PIPELINES = {
    # 'Spider_wangyi.pipelines.SpiderWangyiPipeline': 300,
    'Spider_wangyi.pipelines.Pipiline_ToCSV': 300,
}
2. Writing pipelines.py
import os
import csv
import codecs


class Pipiline_ToCSV(object):
    def __init__(self):
        # location of the output file
        store_file = os.path.dirname(__file__) + '/spiders/wangyi.csv'
        # open the file with the utf_8_sig encoding (writes a BOM, so Excel reads it correctly)
        self.file = codecs.open(filename=store_file, mode='wb', encoding='utf_8_sig')
        # csv writer over that file
        self.writer = csv.writer(self.file)

    def process_item(self, item, spider):
        line = (item['title'], item['url'], item['source'], item['timestamp'], item['detail'])
        # write one row per item
        self.writer.writerow(line)
        return item

    def close_spider(self, spider):
        self.file.close()
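As a design note, Scrapy also ships a CsvItemExporter that can do the same job and writes a header row from the item fields automatically. The following is a minimal alternative sketch, not the pipeline actually used this week; the class name Pipeline_ToCSV_Exporter and the reuse of the wangyi.csv path are my own choices for the example:

import os
from scrapy.exporters import CsvItemExporter


class Pipeline_ToCSV_Exporter(object):
    # alternative sketch: let Scrapy's built-in CSV exporter handle the writing
    def open_spider(self, spider):
        store_file = os.path.dirname(__file__) + '/spiders/wangyi.csv'
        self.file = open(store_file, 'wb')
        self.exporter = CsvItemExporter(self.file, encoding='utf-8-sig')
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

To switch over, the entry in ITEM_PIPELINES would simply point at this class instead, and the spider would still be started with scrapy crawl myspider, no -o flag needed.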
Summary
This week's work wraps up the web-page crawlers. Next we will move on to forum crawling and scrape data from 知乎 (Zhihu) and 微博 (Weibo).