1. items.py: declare the field names under which the scraped data will be stored.
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class TutorialItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    URL = scrapy.Field()    # URL of the current page
    TITLE = scrapy.Field()  # <title> of the current page
    H1 = scrapy.Field()     # first-level (h1) heading
    TEXT = scrapy.Field()   # body text
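A TutorialItem behaves like a dictionary that only accepts the fields declared above; a minimal sketch of how it is used (the values here are purely illustrative):

from tutorial.items import TutorialItem

item = TutorialItem()
item['URL'] = 'http://www.sohu.com/'   # illustrative value
item['TITLE'] = 'example title'        # illustrative value
print(dict(item))    # {'URL': 'http://www.sohu.com/', 'TITLE': 'example title'}
item['AUTHOR'] = 'x' # raises KeyError: only fields declared with scrapy.Field() are allowed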
2. pipelines.py: the logic that writes the scraped data to a file.
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class TutorialPipeline(object):
    def __init__(self):
        self.filename = open("content.txt", 'w', encoding="utf-8")
        self.contain = set()  # set of URLs already written, used for de-duplication

    def process_item(self, item, spider):
        text_dict = dict(item)
        if text_dict['URL'] not in self.contain:  # a page we have not written yet
            for _, target_name in text_dict.items():
                if "人" in target_name:  # only keep pages whose fields contain the character "人"
                    self.write_to_txt(text_dict)
                    break  # write each page at most once
            # remember the URL so repeated pages are filtered out automatically
            self.contain.add(text_dict['URL'])
        return item

    def close_spider(self, spider):
        self.filename.close()

    def write_to_txt(self, text_dict):
        # write every scraped field to the file, one record per page
        for key, value in text_dict.items():
            self.filename.write(key + "内容:\n" + value + '\n')
        self.filename.write(50 * '=' + '\n')
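As the generated comment above notes, this pipeline only runs if it is registered in settings.py. Assuming the project is named tutorial (as the import in the spider below suggests), the entry would look like this:

# settings.py
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,  # lower values run earlier in the pipeline chain
}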
3. sohu.py in the spiders directory: the crawler's parsing logic (the parse callbacks).
# -*- coding: utf-8 -*-
import re

import scrapy
from scrapy import Request

from tutorial.items import TutorialItem

# run the crawl from the project root: scrapy crawl sohu


class SohuSpider(scrapy.Spider):
    name = 'sohu'  # spider name
    # allowed_domains = ['www.sohu.com']  # if set, restricts crawling to these domains after the start page
    start_urls = ['http://www.sohu.com/']  # start URL

    def parse(self, response):
        all_urls = re.findall(r'href="(.*?)"', response.xpath("/html").extract_first())
        for url in all_urls:
            if re.search(r"\.(jpg|jpeg|gif|ico|png|js|css)$", url.strip()):
                pass  # skip links to static resources
            elif url.strip().startswith("http") or url.strip().startswith("//"):
                # conditional expression builds a full URL for protocol-relative links
                temp_url = url.strip() if url.strip().startswith('http') else 'http:' + url.strip()
                item = self.get_all(TutorialItem(), response)
                # only yield the item when the body text and the page title are both non-empty
                if 'TEXT' in item and item['TEXT'] != '' and item['TITLE'] != '':
                    yield item  # send the item to the pipeline
                print('sending <' + temp_url + '> to the downloader')
                yield Request(temp_url, callback=self.parse)  # follow the link recursively

    def get_all(self, item, response):
        # collect the current page's URL, title, first h1 heading and body text
        item['URL'] = response.url.strip()
        item['TITLE'] = (response.xpath('/html/head/title/text()').extract_first() or '').strip()
        contain_h1 = response.xpath('//h1/text()').extract()  # all first-level headings on the page
        contain = contain_h1[0] if len(contain_h1) != 0 else ""  # keep only the first one
        item["H1"] = contain.strip()
        main_text = []
        # walk through the text of every <p> and <br> tag on the page
        for tag in ['p', 'br']:
            sub_text = self.get_content(response, tag)
            main_text.extend(sub_text)
        # de-duplicate the collected text and only set TEXT when something remains
        main_text = list(set(main_text))
        if len(main_text) != 0:
            item['TEXT'] = '\n'.join(main_text)
        return item

    def get_content(self, response, tag):
        # keep only text fragments longer than 100 characters
        main_text = []
        contexts = response.xpath('//' + tag + '/text()').extract()
        for text in contexts:
            if len(text.strip()) > 100:
                main_text.append(text.strip())
        return main_text
To run the crawl, execute it from the project root: E:\tutorial>scrapy crawl sohu
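Based on write_to_txt above, each record saved to content.txt should look roughly like the following (the field values are made up for illustration):

URL内容:
http://www.sohu.com/a/xxxxxxxxx
TITLE内容:
(page title)
H1内容:
(first h1 heading)
TEXT内容:
(body paragraphs longer than 100 characters, one per line)
==================================================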