Continuing from:
Pitfalls of learning the Scrapy framework (Part 1)
Pitfalls of learning Scrapy (Part 2)
The previous two posts scraped the blog titles and links. This post picks up from those steps and scrapes the content of each post as well.
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup

from ..items import BlogScrapyItem


class BlogSpider(scrapy.Spider):
    name = 'blog'
    allowed_domains = ['https://www.cnblogs.com/']
    start_urls = ['https://www.cnblogs.com/']

    def parse(self, response):
        content = response.text
        soup = BeautifulSoup(content, 'lxml')
        targets = []
        titles = soup.find_all('a', class_='titlelnk')
        length = len(titles)
        for i in range(length):
            target = BlogScrapyItem()
            title = titles[i].text
            link = titles[i]['href']
            # fill the item like a dict
            target["title"] = title
            target["link"] = link
            targets.append(target)
        return targets
This is the code we need to modify; it was written quite a long time ago.
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup

from ..items import BlogScrapyItem


class BlogSpider(scrapy.Spider):
    name = 'blog'
    # note: allowed_domains expects bare domains such as 'cnblogs.com', not full URLs
    #allowed_domains = ['https://www.cnblogs.com/']
    start_urls = ['https://www.cnblogs.com/']

    def parse(self, response):
        content = response.text
        soup = BeautifulSoup(content, 'lxml')
        targets = []
        titles = soup.find_all('a', class_='post-item-title')
        cons = soup.find_all('p', class_='post-item-summary')
        length = len(titles)
        for i in range(length):
            target = BlogScrapyItem()
            title = titles[i].text
            link = titles[i]['href']
            con = cons[i].text.strip()
            print('Title of blog %s: %s' % (i + 1, title))
            print('Link: %s' % link)
            print(con)
            # fill the item like a dict
            target["title"] = title
            target["link"] = link
            target["con"] = con
            targets.append(target)
        return targets
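Note that the spider now fills a third field, con, so BlogScrapyItem in items.py must declare it too, otherwise Scrapy raises a KeyError on assignment. Roughly, assuming the two-field item class from the earlier posts, items.py would look like this:

import scrapy


class BlogScrapyItem(scrapy.Item):
    # fields collected from the cnblogs front page
    title = scrapy.Field()
    link = scrapy.Field()
    con = scrapy.Field()  # new field for the post summary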
This post comes quite a while after the previous two, and in the meantime the cnblogs front page has actually changed, so the extraction code above differs from the old version: the title links now carry the post-item-title class instead of titlelnk, and the summary text sits in p.post-item-summary.
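As an aside, BeautifulSoup is not strictly required here: Scrapy's built-in CSS selectors can pull the same data straight out of response. A minimal sketch of a parse method that mirrors the logic above (same class names, pairing titles and summaries by index):

def parse(self, response):
    titles = response.css('a.post-item-title')
    summaries = response.css('p.post-item-summary')
    for t, s in zip(titles, summaries):
        target = BlogScrapyItem()
        target["title"] = t.css('::text').get()
        target["link"] = t.attrib['href']
        # join all text nodes, like BeautifulSoup's .text, then strip
        target["con"] = ''.join(s.css('::text').getall()).strip()
        yield target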
Next, modify the pipelines.py code:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class BlogScrapyPipeline:
    path = 'F:/pycharm文件/document/target.csv'

    def __init__(self):
        self.mytarget = open(self.path, 'a+', encoding='utf-8')

    def process_item(self, item, spider):
        title = item["title"]
        link = item["link"]
        con = item["con"]
        # trailing '\n' keeps successive items from running together
        content = title + '\n' + link + '\n' + con + '\n'
        self.mytarget.write(content)
        return item
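As the header comment says, the pipeline only runs once it is registered under ITEM_PIPELINES in settings.py. Assuming the project package is named blog_scrapy (adjust to your actual package name), the setting would look like this, where the number is just the execution-order priority:

ITEM_PIPELINES = {
    'blog_scrapy.pipelines.BlogScrapyPipeline': 300,
}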
Run results:
A look at the local directory:
The crawl is complete.
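One caveat about the pipeline above: the file opened in __init__ is never closed, and plain string writes do not escape commas or quotes the way a real CSV needs. A sketch of a more robust version using the standard csv module and Scrapy's open_spider/close_spider hooks (same path as above):

# -*- coding: utf-8 -*-
import csv


class BlogScrapyPipeline:
    path = 'F:/pycharm文件/document/target.csv'

    def open_spider(self, spider):
        # newline='' lets the csv module handle line endings itself
        self.file = open(self.path, 'a+', encoding='utf-8', newline='')
        self.writer = csv.writer(self.file)

    def process_item(self, item, spider):
        # one row per post: title, link, summary
        self.writer.writerow([item["title"], item["link"], item["con"]])
        return item

    def close_spider(self, spider):
        # close (and flush) the file when the spider finishes
        self.file.close()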