After mastering the basics of Python, we can use the Scrapy framework to write a crawler.
Create a Python project in PyCharm, then run pip install scrapy in the Terminal window to install Scrapy.
To verify that the installation succeeded, run the following code, which builds a Request for Baidu's home page:
import scrapy

# builds (but does not send) a Request object; success means Scrapy imports correctly
html = scrapy.Request("https://www.baidu.com/")
print(html)
If it runs without errors, Scrapy is installed correctly; printing the Request object produces: <GET https://www.baidu.com/>
To create a Scrapy project, run scrapy startproject stock_spider in the command window; stock_spider is the project name.
In the resulting project directory, create the file stock.py under the spiders folder. The project directory structure is shown below.
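For reference, scrapy startproject generates the standard layout below; stock.py is the file we add by hand under spiders/:

stock_spider/
├── scrapy.cfg
└── stock_spider/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── stock.py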
The stock.py file:
# -*- coding: utf-8 -*-
import scrapy
# parse.urljoin joins relative links onto the base domain
from urllib import parse
# regular expressions
import re
# import the item class defined in this project
from stock_spider.items import stockItem

# the spider skeleton was generated with: scrapy genspider stock pycs.greedyai.com

class StockSpider(scrapy.Spider):
    name = 'stock'
    # placeholder domain; replace with the real site to crawl
    allowed_domains = ['url.com']
    start_urls = ['http://url.com/']

    def parse(self, response):
        # pagination idea: collect every link on the start page, then follow each one
        post_urls = response.xpath("//a/@href").extract()
        for post_url in post_urls:
            # yield hands the request to the engine (similar to a lazy return);
            # urljoin resolves relative links against response.url;
            # dont_filter=True disables Scrapy's duplicate-request filter;
            # each sub-link is then processed by the parse_detail callback
            yield scrapy.Request(url=parse.urljoin(response.url, post_url),
                                 callback=self.parse_detail,
                                 dont_filter=True)

    def parse_detail(self, response):
        # store the scraped fields in an item
        stock_item = stockItem()
        # names
        stock_item["names"] = self.get_tc(response)
        # sexes
        stock_item["sexes"] = self.get_sex(response)
        # ages
        stock_item["ages"] = self.get_age(response)
        # stock codes
        stock_item["codes"] = self.get_code(response)
        # positions (one per name)
        stock_item["leaders"] = self.get_leader(response, len(stock_item["names"]))
        # hand the item over to the pipeline, which does the file storage
        yield stock_item
        # tip: a method with no body yet needs a bare `pass` to avoid a syntax error

    def get_tc(self, response):
        tc_name = response.xpath('//*[@class="tc name"]/a/text()').extract()
        return tc_name

    def get_sex(self, response):
        infos = response.xpath('//*[@class="intro"]/text()').extract()
        sex_list = []
        # some intro texts contain no sex marker, so guard against IndexError
        for info in infos:
            try:
                # [男女] matches either character; findall returns a list, take the first hit
                sex = re.findall("[男女]", info)[0]
                sex_list.append(sex)
            except IndexError:
                continue
        return sex_list

    def get_age(self, response):
        infos = response.xpath('//*[@class="intro"]/text()').extract()
        age_list = []
        for info in infos:
            try:
                age = re.findall(r"\d+", info)[0]  # one or more digits
                age_list.append(age)
            except IndexError:
                continue
        return age_list

    def get_code(self, response):
        # the XPath was copied from the browser (you can also locate the node by id/class);
        # the stock code we need sits in the <a> tag's title attribute
        infos = response.xpath('/html/body/div[3]/div[1]/div[2]/div[1]/h1/a/@title').extract()
        code_list = []
        for info in infos:
            try:
                code = re.findall(r"\d+", info)[0]  # one or more digits
                code_list.append(code)
            except IndexError:
                continue
        return code_list

    def get_leader(self, response, length):
        tc_leaders = response.xpath('//*[@class="tl"]/text()').extract()
        # keep only as many position entries as there are names
        tc_leaders = tc_leaders[0:length]
        return tc_leaders
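Before hard-coding XPath expressions like the ones above, it helps to test them interactively with Scrapy's built-in shell; the detail-page URL below is a placeholder:

scrapy shell "http://url.com/some_detail_page"
>>> response.xpath('//*[@class="tc name"]/a/text()').extract()
>>> response.xpath('//*[@class="intro"]/text()').extract()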
The items.py file (the spider above imports stockItem from it):
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class StockSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


# item that holds the fields scraped from the page
class stockItem(scrapy.Item):
    names = scrapy.Field()
    sexes = scrapy.Field()
    ages = scrapy.Field()
    codes = scrapy.Field()
    leaders = scrapy.Field()
The pipelines.py file:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import os


class StockSpiderPipeline(object):
    def process_item(self, item, spider):
        return item


# items only reach a pipeline after it is enabled in settings.py; uncomment there:
# ITEM_PIPELINES = {
#     'stock_spider.pipelines.StockSpiderPipeline': 300,
#     'stock_spider.pipelines.stockPipeline': 1,
# }

class stockPipeline(object):
    def __init__(self):
        # open the storage file once when the pipeline is created;
        # "a+" appends and creates the file if it does not exist
        # (if Excel garbles the Chinese text, consider encoding="utf-8-sig")
        self.file = open("executive_prep.csv", "a+")

    def process_item(self, item, spider):
        # write the header row first if the file is still empty
        if not os.path.getsize("executive_prep.csv"):
            # header columns: executive name, sex, age, stock code, position
            self.file.write("高管姓名,性别,年龄,股票代码,职位\n")
        self.write_content(item)
        # flush so the rows are not left sitting in the write buffer
        self.file.flush()
        return item

    def write_content(self, item):
        names = item["names"]
        sexes = item["sexes"]
        ages = item["ages"]
        codes = item["codes"]
        leaders = item["leaders"]
        for i in range(len(names)):
            # join the fields into one CSV row; every row on a page shares codes[0]
            result = names[i] + "," + sexes[i] + "," + ages[i] + "," + codes[0] + "," + leaders[i] + "\n"
            self.file.write(result)
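Joining fields with commas by hand breaks if a field itself contains a comma or quote; Python's standard csv module handles quoting automatically. A minimal sketch of a drop-in replacement for write_content, assuming self.file was opened with newline="" (csv.writer manages line endings itself):

import csv

def write_content(self, item):
    writer = csv.writer(self.file)
    # zip stops at the shortest list, keeping the columns aligned
    for name, sex, age, leader in zip(item["names"], item["sexes"],
                                      item["ages"], item["leaders"]):
        # every executive on a page shares the same stock code
        writer.writerow([name, sex, age, item["codes"][0], leader])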
The Main.py file, used to run and debug the spider from PyCharm:
from scrapy.cmdline import execute
import sys
import os

# make the project root importable so the spider can be run and debugged from the IDE
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# run the spider named "stock" (the name attribute of StockSpider)
execute(["scrapy", "crawl", "stock"])
If a page blocks the crawler, add a browser User-Agent in settings.py:
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
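Putting the settings together, the relevant part of settings.py would look roughly like this; ROBOTSTXT_OBEY = False is an addition beyond the original notes (Scrapy obeys robots.txt by default, which can also block a crawl):

# settings.py (relevant excerpt)
BOT_NAME = 'stock_spider'

# identify as a normal browser
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'

# Scrapy obeys robots.txt by default; disable if it blocks the crawl
ROBOTSTXT_OBEY = False

# enable the pipelines (lower number = runs first)
ITEM_PIPELINES = {
    'stock_spider.pipelines.stockPipeline': 1,
    'stock_spider.pipelines.StockSpiderPipeline': 300,
}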
The scraped data ends up in the executive_prep.csv file specified in the pipeline; since it is stored as CSV, the file can be opened directly in Excel.