A Simple Application of Scrapy with MongoDB
1. Preparation
First, install PyCharm and MongoDB, and install the pymongo module (e.g. via pip).
A MongoDB GUI tool is optional; you can also inspect the data from the mongo shell.
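A quick way to confirm the environment is ready is to connect with pymongo directly. This is just a sanity-check sketch and assumes MongoDB is running locally on the default port:

import pymongo

# Minimal connectivity check; assumes a local MongoDB on the default port
client = pymongo.MongoClient(host="127.0.0.1", port=27017)
print(client.server_info()["version"])  # raises an error if the server is unreachable
client.close()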
2. Getting Started
This example crawls a novel from a website and writes the chapters into MongoDB.
items.py:
import scrapy


class HomeworkItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    chapter_names = scrapy.Field()
    contents = scrapy.Field()
    chapter_url = scrapy.Field()
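The spider itself is not reproduced here. For context, a minimal sketch of what it might look like is shown below; the spider name, start URL, and CSS selectors are placeholders rather than the real site's, and the import assumes the project is named Homework:

import scrapy
from Homework.items import HomeworkItem  # assumes the project/package is named "Homework"


class NovelSpider(scrapy.Spider):
    name = "novel"
    start_urls = ["http://example.com/novel/"]  # placeholder URL, not the original site

    def parse(self, response):
        # Placeholder selector; the real chapter-list markup will differ
        for link in response.css("div.chapter-list a"):
            yield response.follow(link, callback=self.parse_chapter)

    def parse_chapter(self, response):
        item = HomeworkItem()
        item["chapter_names"] = response.css("h1::text").get()
        item["contents"] = "\n".join(response.css("div.content p::text").getall())
        item["chapter_url"] = response.url
        yield item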
pipelines.py:
import pymongo
from scrapy.utils.project import get_project_settings


class HomeworkPipeline(object):
    def __init__(self):
        # # Location of the csv file; it does not need to be created beforehand
        # store_file = os.path.dirname(__file__) + '/spiders/douloudalu.csv'
        # # Open (create) the file
        # self.file = open(store_file, 'w')
        # # csv writer
        # self.writer = csv.writer(self.file)
        settings = get_project_settings()
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        sheetname = settings["MONGODB_SHEETNAME"]
        # Create the MongoDB client; keep it on self so close_spider can close it
        self.client = pymongo.MongoClient(host=host, port=port)
        # Select the database
        mydb = self.client[dbname]
        # Collection ("table") the data will be stored in
        self.post = mydb[sheetname]

    def process_item(self, item, spider):
        # chapter_names = item["chapter_names"]
        # chapter_url = item["chapter_url"]
        # contents = item["contents"]
        data = dict(item)
        # insert() was removed in newer pymongo releases; insert_one() is the current API
        self.post.insert_one(data)
        return item
        # Only write a row to the csv file when all fields are non-empty
        # self.writer.writerow(['title', 'url', 'contents'])
        # if chapter_names and chapter_url and contents:
        #     self.writer.writerow((chapter_names, chapter_url, contents))
        # return item

    def close_spider(self, spider):
        self.client.close()
        # Also save and close the csv file when the spider shuts down
        # self.file.close()
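A side note on the design: get_project_settings() works here, but the more idiomatic Scrapy way to read settings inside a pipeline is the from_crawler classmethod; for a small example like this, either is fine.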
settings.py:
# MongoDB host
MONGODB_HOST = "127.0.0.1"
# MongoDB port
MONGODB_PORT = 27017
# Database name
MONGODB_DBNAME = "Douluodalu"
# Collection ("table") that stores the data
MONGODB_SHEETNAME = "douluocontents"
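One thing to remember: the pipeline must also be enabled in settings.py, otherwise process_item is never called. The dotted path below assumes the project is named Homework; adjust it to your own project:

# Enable the pipeline (lower number = runs earlier)
ITEM_PIPELINES = {
    "Homework.pipelines.HomeworkPipeline": 300,
}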
After that, run the spider. You can view the stored results by entering commands in the mongo shell.
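For example (the database and collection names follow the settings above):

use Douluodalu
db.douluocontents.find().pretty()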