1. Generators
-
- What yield does: it turns a function into a generator
-
- Fibonacci sequence:
def fib(n):
    a, b, s = 0, 1, 0
    while s < n:
        a, b = b, a + b
        s += 1
        yield b

# print(fib(5).__next__())
for i in fib(5):
    print(i)
-
- Key property of yield: it pauses the function and waits for the next call
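A minimal sketch of that pausing behavior (plain Python, nothing Scrapy-specific): each next() call resumes the function, runs it until the next yield, and pauses it again.

def counter():
    print('running up to the first yield')
    yield 1
    print('resumed, running up to the second yield')
    yield 2

g = counter()      # nothing runs yet; calling the function only builds the generator
print(next(g))     # prints the first message, then 1
print(next(g))     # prints the second message, then 2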
2. Project: CSDN
-
- Key points: using yield and pipelines
-
- Scraping targets: title, publish time, read count
-
- Steps:
- Create the project
$ scrapy startproject CSDN
- Define the data structure (items.py)
class CsdnItem(scrapy.Item):
    name = scrapy.Field()
    time = scrapy.Field()
    number = scrapy.Field()
- Create the spider (csdn.py)

$ cd CSDN/spiders
$ scrapy genspider csdn blog.csdn.net
$ subl csdn.py
# -*- coding: utf-8 -*-
import scrapy
from CSDN.items import CsdnItem

class CSDNSpider(scrapy.Spider):
    # spider name, used when running the crawl
    name = 'csdn'
    # allowed domains
    allowed_domains = ['blog.csdn.net']
    # start url
    start_urls = ['https://blog.csdn.net/qq_42231391/article/details/83506181']

    def parse(self, response):
        item = CsdnItem()
        # with text() the selector object holds only the text inside
        # <h1></h1>, not the tag itself; extract() turns the selector
        # contents into unicode strings
        item['name'] = response.xpath("//h1[@class='title-article']/text()").extract()[0]
        # response.xpath("...")        ---> [<selector ... data="<div>...</div>">]
        # response.xpath(".../text()") ---> [<selector ... data="text content">]
        # extract(): pulls the text out of the selector objects -> ['text content']
        item['time'] = response.xpath("//span[@class='time']/text()").extract()[0]
        item['number'] = response.xpath("//span[@class='read-count']/text()").extract()[0]
        yield item
- Process the data scraped in step 3 through the item pipeline

pipelines.py

class CsdnPipeline(object):
    def process_item(self, item, spider):
        print('================')
        print(item['name'])
        print(item['time'])
        print(item['number'])
        print('****************')
        return item

settings.py

ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
ITEM_PIPELINES = {
    'CSDN.pipelines.CsdnPipeline': 300,
}
- Run the spider
$ scrapy crawl csdn
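As an alternative to printing in the pipeline, Scrapy's built-in feed export can dump the items straight to a file; a quick variant (the output filename here is arbitrary):

$ scrapy crawl csdn -o csdn.json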
3. Project: Daomu (盗墓笔记)
-
- Targets
book name, book title, chapter name, chapter number, chapter link
-
- Steps
- Create the project
scrapy startproject Daomu
- Edit items.py
class DaomuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    bookName = scrapy.Field()
    bookTitle = scrapy.Field()
    chapter = scrapy.Field()
    chapterNum = scrapy.Field()
    chapterLink = scrapy.Field()
- Create the spider file (daomu.py)

$ cd spiders/
$ scrapy genspider daomu daomubiji.com
$ subl daomu.py
# -*- coding: utf-8 -*-
import scrapy
from Daomu.items import DaomuItem

class DaomuSpider(scrapy.Spider):
    name = 'daomu'
    allowed_domains = ['daomubiji.com']
    start_urls = ['http://www.daomubiji.com/dao-mu-bi-ji-1']

    def parse(self, response):
        item = DaomuItem()
        item['bookName'] = response.xpath("//h1[@class='focusbox-title']/text()").extract()[0]
        # match every chapter node (base xpath)
        articles = response.xpath("//article[@class='excerpt excerpt-c3']")
        for article in articles:
            info = article.xpath("./a/text()").extract()[0].split(' ')
            # info ---> ['七星鲁王', '第一章', '门']
            item['bookTitle'] = info[0]
            item['chapter'] = info[2]
            item['chapterNum'] = info[1]
            item['chapterLink'] = article.xpath('./a/@href').extract()[0]
            yield item
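A small illustration of the split above, using the sample value from the comment: the anchor text carries the book title, chapter number, and chapter name separated by single spaces.

info = '七星鲁王 第一章 门'.split(' ')
print(info)   # ['七星鲁王', '第一章', '门']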
- Edit pipelines.py
class DaomuPipeline(object):
    def process_item(self, item, spider):
        print('---------------')
        print(item)
        print('*' * 10)
        return item
- Configure settings.py
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
ITEM_PIPELINES = {
    'Daomu.pipelines.DaomuPipeline': 300,
}
- Run the spider
scrapy crawl daomu
4. Key points
-
- extract(): gets the text content out of selector objects

response.xpath("...")        ---> [<selector ... data="<div>...</div>">]
response.xpath(".../text()") ---> [<selector ... data="text content">]
extract(): pulls the text out of the selector objects ---> ['text content']
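A related convenience worth knowing: extract_first() returns the first match directly (or None when nothing matches), avoiding the IndexError that extract()[0] raises on an empty result. A quick sketch in the scrapy shell (the URL is just an example):

$ scrapy shell 'https://blog.csdn.net'
>>> response.xpath("//h1/text()").extract()          # list of strings, may be []
>>> response.xpath("//h1/text()").extract_first()    # first string, or None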
-
- start_urls in a spider must be a list
start_urls = []
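The reason is that Scrapy iterates over start_urls to build the initial requests, so a bare string would be iterated character by character. A plain-Python illustration of the pitfall:

for url in 'https://example.com':
    print(url)   # 'h', 't', 't', 'p', ... one character per "url"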
-
- pipelines.py must define a method named process_item(self, item, spider), with exactly this signature.

class DaomuMongoPipeline(object):
    def __init__(self):
        pass

    def process_item(self, item, spider):
        return item
-
- To store data in MongoDB, define the MongoDB variables in settings.py

MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
-
- Import the settings module in pipelines.py

import pymongo
from Daomu import settings

class DaomuMongoPipeline(object):
    def __init__(self):
        # host and port are locals here, so they cannot be referenced as
        # self.host etc. outside __init__()
        host = settings.MONGODB_HOST
        port = settings.MONGODB_PORT
        conn = pymongo.MongoClient(host=host, port=port)
        db = conn.DaomuDB
        self.myset = db.daomubiji

    def process_item(self, item, spider):
        # note: item is a dict-like object; convert it to a plain Python
        # dict before inserting
        bookInfo = dict(item)
        self.myset.insert_one(bookInfo)
        print('Data stored successfully')
        return item
- Note: DaomuMongoPipeline must be added to ITEM_PIPELINES, with its priority set to 100

ITEM_PIPELINES = {
    'Daomu.pipelines.DaomuMongoPipeline': 100,
}
-
- To store data in MySQL, define the MySQL variables in settings.py

MYSQL_HOST = 'localhost'
MYSQL_USER = 'root'
MYSQL_PWD = '123456'
MYSQL_DB = 'DaoMuDB'
-
- Import the settings module in pipelines.py

import pymysql
from Daomu import settings

class DaomuMySQLPipeline(object):
    def __init__(self):
        # host, user, etc. are locals here, so they cannot be referenced
        # as self.host etc. outside __init__()
        host = settings.MYSQL_HOST
        user = settings.MYSQL_USER
        pwd = settings.MYSQL_PWD
        dbName = settings.MYSQL_DB
        self.db = pymysql.connect(host=host, user=user, password=pwd,
                                  db=dbName, charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        ins = 'insert into daomubiji values(%s, %s, %s, %s, %s)'
        L = [item['bookName'], item['bookTitle'], item['chapter'],
             item['chapterNum'], item['chapterLink']]
        self.cursor.execute(ins, L)
        self.db.commit()
        print('Data stored successfully')
        return item
- Note: DaomuMySQLPipeline must be added to ITEM_PIPELINES, with its priority set to 200

ITEM_PIPELINES = {
    'Daomu.pipelines.DaomuMySQLPipeline': 200,
}
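When both pipelines are registered, the number controls the order: the lower priority runs first, and each process_item must return the item so it reaches the next pipeline. A combined sketch of the two entries above:

ITEM_PIPELINES = {
    'Daomu.pipelines.DaomuMongoPipeline': 100,   # runs first
    'Daomu.pipelines.DaomuMySQLPipeline': 200,   # receives the item next
}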
5. Project: job postings from the Tencent recruitment site (CSV version)
-
- XPath matching
- Base XPath expression (one node per job posting; tested in the scrapy shell sketch after this list)
//tr[@class='odd'] | //tr[@class='even']
- Scraping targets:
Job title     : ./td[1]/a/text()
Detail link   : ./td[1]/a/@href
Job category  : ./td[2]/text()
Headcount     : ./td[3]/text()
Work location : ./td[4]/text()
Publish date  : ./td[5]/text()
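Before wiring the expressions into the spider, they can be verified interactively; a sketch (the listing URL is the one the spider below uses):

$ scrapy shell 'https://hr.tencent.com/position.php?&start=0'
>>> rows = response.xpath("//tr[@class='odd'] | //tr[@class='even']")
>>> rows[0].xpath("./td[1]/a/text()").extract_first()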
-
- Implementation
- Create the project
$ scrapy startproject Tencent
- Edit the items.py file
import scrapy

class TencentItem(scrapy.Item):
    # job title
    zhName = scrapy.Field()
    # detail link
    zhLink = scrapy.Field()
    # job category
    zhType = scrapy.Field()
    # headcount
    zhNum = scrapy.Field()
    # work location
    zhAddress = scrapy.Field()
    # publish date
    zhTime = scrapy.Field()
- Create the spider module file tencent.py
# -*- coding: utf-8 -*-
import scrapy
from Tencent.items import TencentItem

class TencentSpider(scrapy.Spider):
    name = 'tencent'
    allowed_domains = ['hr.tencent.com']
    # base url, kept for later url concatenation
    url = 'https://hr.tencent.com/position.php?&start='
    offset = 0
    # start_urls only holds the very first url
    start_urls = [url + str(offset)]

    def parse(self, response):
        # loop over the offsets and hand each finished url to the engine;
        # the engine passes all 293 page URLs on to the scheduler
        for i in range(0, 2921, 10):
            yield scrapy.Request(self.url + str(i), callback=self.parseHtml)

    def parseHtml(self, response):
        # list of job posting nodes on the current page
        base_list = response.xpath("//tr[@class='odd'] | //tr[@class='even']")
        # visit each <tr> node on the current page
        for base in base_list:
            item = TencentItem()
            item['zhName'] = base.xpath("./td[1]/a/text()").extract()[0]
            item['zhLink'] = base.xpath("./td[1]/a/@href").extract()[0]
            # some rows are missing the job category, so check first
            zhType = base.xpath("./td[2]/text()").extract()
            if not zhType:
                item['zhType'] = ''
            else:
                item['zhType'] = zhType[0]
            item['zhNum'] = base.xpath("./td[3]/text()").extract()[0]
            item['zhAddress'] = base.xpath("./td[4]/text()").extract()[0]
            item['zhTime'] = base.xpath("./td[5]/text()").extract()[0]
            yield item
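A quick check of the paging arithmetic in parse(): stepping by 10 from 0 up to (but not including) 2921 yields the offsets 0, 10, ..., 2920, i.e. the 293 pages mentioned in the comment.

print(len(range(0, 2921, 10)))   # 293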
- Edit the pipelines.py file to save the data into a CSV file
import csv

class TencentPipeline(object):
    def process_item(self, item, spider):
        # append one row per item; newline='' keeps the csv module from
        # writing blank lines on Windows
        with open('zhaopin.csv', 'a', newline='') as f:
            L = [item['zhName'], item['zhLink'], item['zhType'],
                 item['zhNum'], item['zhAddress'], item['zhTime']]
            writer = csv.writer(f)
            writer.writerow(L)
        return item
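Reopening the file for every item works but is wasteful. A variant sketch (same fields assumed) that opens the file once per crawl, using Scrapy's open_spider/close_spider pipeline hooks:

import csv

class TencentCsvPipeline(object):
    def open_spider(self, spider):
        # called once when the crawl starts
        self.f = open('zhaopin.csv', 'a', newline='')
        self.writer = csv.writer(self.f)

    def process_item(self, item, spider):
        self.writer.writerow([item['zhName'], item['zhLink'], item['zhType'],
                              item['zhNum'], item['zhAddress'], item['zhTime']])
        return item

    def close_spider(self, spider):
        # called once when the crawl ends
        self.f.close()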
- Modify the settings.py file
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
ITEM_PIPELINES = {
    'Tencent.pipelines.TencentPipeline': 300,
}
6. Project: job postings from the Tencent recruitment site (MongoDB version)
-
- Create the scraping project:
scrapy startproject Tencent_Mongo
-
- Edit the items.py file and define the fields to scrape
import scrapy

class TencentMongoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    zhName = scrapy.Field()
    zhLink = scrapy.Field()
    zhType = scrapy.Field()
    zhNum = scrapy.Field()
    zhAddress = scrapy.Field()
    zhTime = scrapy.Field()
-
- Create the spider file tencent_mongo.py
scrapy genspider tencent_mongo hr.tencent.com
-
- Edit the tencent_mongo.py file
# -*- coding: utf-8 -*-
import scrapy
from Tencent_Mongo.items import TencentMongoItem

class TencentMongoSpider(scrapy.Spider):
    name = 'tencent_mongo'
    allowed_domains = ['hr.tencent.com']
    base_url = 'https://hr.tencent.com/position.php?&start='
    start_urls = [base_url + '0']

    def parse(self, response):
        for i in range(0, 2921, 10):
            yield scrapy.Request(url=self.base_url + str(i), callback=self.parseHtml)

    def parseHtml(self, response):
        tr_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for tr in tr_list:
            # build a fresh item per row so earlier rows are not overwritten
            item = TencentMongoItem()
            item['zhName'] = tr.xpath("./td[1]/a/text()").extract()[0]
            # keep the xpath relative to the row; an absolute expression
            # would always return the first matching row's link
            item['zhLink'] = tr.xpath("./td[1]/a/@href").extract()[0]
            # some rows are missing the job category, so check first
            zhType = tr.xpath("./td[2]/text()").extract()
            if not zhType:
                item['zhType'] = ''
            else:
                item['zhType'] = zhType[0]
            item['zhNum'] = tr.xpath("./td[3]/text()").extract()[0]
            item['zhAddress'] = tr.xpath("./td[4]/text()").extract()[0]
            item['zhTime'] = tr.xpath("./td[5]/text()").extract()[0]
            yield item
-
- Configure the settings.py file
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
ITEM_PIPELINES = {
    'Tencent_Mongo.pipelines.TencentMongoPipeline': 300,
}
# MongoDB settings
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
-
- Edit the pipelines.py file

import pymongo
from . import settings

class TencentMongoPipeline(object):
    def __init__(self):
        host = settings.MONGODB_HOST
        port = settings.MONGODB_PORT
        conn = pymongo.MongoClient(host=host, port=port)
        db = conn.TencentDB
        self.myset = db.tencent

    def process_item(self, item, spider):
        # item is a dict-like object; convert it to a plain Python dict
        job_msg = dict(item)
        self.myset.insert_one(job_msg)
        print('Data stored successfully')
        return item
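A quick verification sketch once the crawl has finished (assumes MongoDB is running on the host/port configured above): count the stored documents with pymongo.

import pymongo

conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
print(conn['TencentDB']['tencent'].count_documents({}))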
7. Project: job postings from the Tencent recruitment site (MySQL version)
-
- Create the project Tencent_MySQL
scrapy startproject Tencent_MySQL
-
- Edit the items.py file and define the scraping targets
import scrapy

class TencentMysqlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    zhName = scrapy.Field()
    zhLink = scrapy.Field()
    zhType = scrapy.Field()
    zhNum = scrapy.Field()
    zhAddress = scrapy.Field()
    zhTime = scrapy.Field()
-
- Create the spider module file tencent_mysql.py
scrapy genspider tencent_mysql hr.tencent.com
-
- Edit the tencent_mysql.py file
# -*- coding: utf-8 -*-
import scrapy
from ..items import TencentMysqlItem

class TencentMysqlSpider(scrapy.Spider):
    name = 'tencent_mysql'
    allowed_domains = ['hr.tencent.com']
    base_url = 'https://hr.tencent.com/position.php?&start='
    start_urls = [base_url + '0']

    def parse(self, response):
        for i in range(0, 2921, 10):
            yield scrapy.Request(self.base_url + str(i), callback=self.parseHtml)

    def parseHtml(self, response):
        tr_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for tr in tr_list:
            # build a fresh item per row so earlier rows are not overwritten
            item = TencentMysqlItem()
            item['zhName'] = tr.xpath("./td[1]/a/text()").extract()[0]
            # keep the xpath relative to the row; an absolute expression
            # would always return the first matching row's link
            item['zhLink'] = tr.xpath("./td[1]/a/@href").extract()[0]
            # some rows are missing the job category, so check first
            zhType = tr.xpath("./td[2]/text()").extract()
            if not zhType:
                item['zhType'] = ''
            else:
                item['zhType'] = zhType[0]
            item['zhNum'] = tr.xpath("./td[3]/text()").extract()[0]
            item['zhAddress'] = tr.xpath("./td[4]/text()").extract()[0]
            item['zhTime'] = tr.xpath("./td[5]/text()").extract()[0]
            yield item
-
- Configure the settings.py file
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
ITEM_PIPELINES = {
    'Tencent_MySQL.pipelines.TencentMysqlPipeline': 300,
}
# MySQL settings
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PWD = '123456'
-
- Edit the pipelines.py file to save the data into MySQL
import warnings
import pymysql
from . import settings

class TencentMysqlPipeline(object):
    def __init__(self):
        host = settings.MYSQL_HOST
        port = settings.MYSQL_PORT
        user = settings.MYSQL_USER
        pwd = settings.MYSQL_PWD
        self.db = pymysql.connect(host=host, port=port, user=user,
                                  password=pwd, charset='utf8')
        self.cursor = self.db.cursor()
        # create the database and table on the first run
        c_db = 'create database if not exists TencentDB charset="utf8"'
        u_db = 'use TencentDB'
        c_tab = ("create table if not exists tencent("
                 "id int primary key auto_increment, "
                 "zhName text, zhLink text, zhType varchar(30), "
                 "zhNum varchar(30), zhAddress varchar(30), "
                 "zhTime varchar(30)) charset='utf8'")
        # silence the "already exists" warnings raised on later runs
        warnings.filterwarnings('ignore')
        try:
            self.cursor.execute(c_db)
            self.cursor.execute(u_db)
            self.cursor.execute(c_tab)
        except Warning:
            pass

    def process_item(self, item, spider):
        ins = ('insert into tencent(zhName, zhLink, zhType, zhNum, '
               'zhAddress, zhTime) values(%s, %s, %s, %s, %s, %s)')
        L = [item['zhName'], item['zhLink'], item['zhType'],
             item['zhNum'], item['zhAddress'], item['zhTime']]
        self.cursor.execute(ins, L)
        self.db.commit()
        print('Data stored in the database')
        return item
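A quick verification sketch after the crawl (assumes the MySQL credentials configured above): count the stored rows with pymysql.

import pymysql

db = pymysql.connect(host='localhost', port=3306, user='root',
                     password='123456', charset='utf8')
cursor = db.cursor()
cursor.execute('select count(*) from TencentDB.tencent')
print(cursor.fetchone()[0])
db.close()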
Actions speak louder than words, and practice brings real knowledge. Feel free to follow along so we can learn together.