python爬虫（四）数据存储

最新推荐文章于 2022-10-26 00:53:27 发布

夜色的最黑暗。

最新推荐文章于 2022-10-26 00:53:27 发布

阅读量644

点赞数

分类专栏：爬虫文章标签： python 数据库 mysql sql

本文链接：https://blog.csdn.net/qq_42385761/article/details/109718260

版权

爬虫专栏收录该内容

9 篇文章 3 订阅

订阅专栏

python爬虫（四）数据存储

JSON文件存储

JSON是一种轻量级的数据交换格式，它是基于ECMAScript的一个子集
JSON采用完全独立于语言的文本格式
JSON在Python中分别由list和dict组成

JSON模块的功能

序号	函数	描述
1	json.dumps()	实现python类型转化为json字符串，返回一个str对象
2	json.loads()	把json格式的字符串转换成python类型
3	json.dump()	将python内置类型序列化为json格式后写入文件
4	json.load()	读取文件中json形式的字符串转换为python类型

# Storing JSON data
import json

s = '{"name":"张三"}'

# Parse the JSON text into a Python object (a dict for this input)
obj = json.loads(s)
print(obj, type(obj))

# Serialize the object back into a JSON string; ensure_ascii=False keeps
# non-ASCII characters as-is instead of \uXXXX escapes
s1 = json.dumps(obj, ensure_ascii=False)
print(type(s1))
print(s1)

# Save the object to a file. The with-block closes the handle, so the data
# is flushed to disk before the file is read back below.
with open('movie.txt', 'w', encoding='utf-8') as fp:
    json.dump(obj, fp, ensure_ascii=False)

# Load the file content back into the program
with open('movie.txt', encoding='utf-8') as fp:
    obj2 = json.load(fp)
print(obj2)
print(type(obj2))

爬取京东销售最好的粽子数据

def send_request():
    """Request the first page of JD product comments and return the body text.

    Returns:
        The response text. Because the URL passes
        ``callback=fetchJSON_comment98``, the body is expected to be JSON
        wrapped in ``fetchJSON_comment98(...);``.
    """
    # Imported locally because this snippet has no import section of its own.
    import requests

    url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=1087591&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
    # Without a timeout, requests waits indefinitely on a stalled connection.
    resp = requests.get(url, headers=headers, timeout=10)
    return resp.text

def parse_json(data):
    """Strip the ``fetchJSON_comment98(...);`` JSONP wrapper from *data*.

    Only the leading callback name and the trailing ``);`` are removed, so a
    ``);`` that occurs inside the payload (for example in a comment's text)
    is left intact.

    Args:
        data: Raw response text.

    Returns:
        The JSON text without the wrapper. Text that carries no wrapper is
        returned with surrounding whitespace stripped.
    """
    prefix = 'fetchJSON_comment98('
    suffix = ');'
    body = data.strip()
    if body.startswith(prefix):
        body = body[len(prefix):]
    if body.endswith(suffix):
        body = body[:-len(suffix)]
    return body

def type_change(data):
    """Deserialize the JSON text *data* into the matching Python object."""
    parsed = json.loads(data)
    return parsed

def save(obj):
    """Write *obj* as JSON text to ``京东销售最好的粽子数据.txt`` (UTF-8).

    The file is opened in a with-block so the handle is closed, and the data
    flushed, as soon as the dump finishes.
    """
    with open('京东销售最好的粽子数据.txt', 'w', encoding='utf-8') as fp:
        # ensure_ascii=False stores Chinese text readably instead of \uXXXX
        json.dump(obj, fp, ensure_ascii=False)

def start():
    """Run the pipeline: download, unwrap the JSONP text, parse it, save it."""
    raw_text = send_request()
    json_text = parse_json(raw_text)
    save(type_change(json_text))


if __name__ == '__main__':
    start()

CSV文件存储

CSV是Comma Separated Values 称为逗号分隔值，是一种以.csv结尾的文件

CSV文件的特点

值没有类型，所有值都是字符串
不能指定字体颜色等样式
不能指定单元格的宽高
不能合并单元格
没有多个工作表
不能嵌入图像图表

CSV文件的创建

新建Excel文件
编写数据
另存为CSV文件

向CSV文件写入数据

引入CSV模块
使用open()函数创建CSV文件
借助csv.writer()函数创建writer对象
调用writer对象的writerow()方法写入一行数据
调用writer对象的writerows()方法写入多行数据

import csv

# Records written in one call after the single-row write below
extra_rows = [
    ['jack', 23, 98],
    ['mary', 22, 97],
    ['lili', 22, 78],
]

with open('student.csv', 'a+', newline='') as csv_file:
    # csv.writer turns each list into one comma-separated line
    out = csv.writer(csv_file)
    # writerow() emits a single record
    out.writerow(['麻七', 19, 78])
    # writerows() emits every record of the iterable, in order
    out.writerows(extra_rows)

从CSV文件中读取数据

引入CSV模块
使用open()函数打开CSV文件
借助csv.reader()函数创建reader对象
读到的每一行都是一个列表

import csv

with open('student.csv', 'r', newline='') as csv_file:
    # csv.reader yields each line of the file as a list of strings
    for record in csv.reader(csv_file):
        print(record)

爬取京东粽子评论数据

import  requests
import json
import csv

def send_request():
    """Request the first page of JD product comments and return the body text.

    Returns:
        The response text. Because the URL passes
        ``callback=fetchJSON_comment98``, the body is expected to be JSON
        wrapped in ``fetchJSON_comment98(...);``.
    """
    url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=1087591&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
    # Without a timeout, requests waits indefinitely on a stalled connection.
    resp = requests.get(url, headers=headers, timeout=10)
    return resp.text

def parse_html(data):
    """Extract each comment's text and creation time from *data* and save them.

    Args:
        data: Response text, expected in the form
            ``fetchJSON_comment98({...});``.

    Raises:
        json.JSONDecodeError: If the unwrapped text is not valid JSON.
        KeyError: If the payload lacks ``comments`` or a comment lacks
            ``content`` / ``creationTime``.
    """
    prefix = 'fetchJSON_comment98('
    suffix = ');'
    # Remove the wrapper only at the two ends; a blanket replace() would also
    # delete any ");" that appears inside a comment's text.
    body = data.strip()
    if body.startswith(prefix):
        body = body[len(prefix):]
    if body.endswith(suffix):
        body = body[:-len(suffix)]
    dict_data = json.loads(body)
    lst = [[item['content'], item['creationTime']] for item in dict_data['comments']]
    save(lst)

def save(lst):
    """Write the rows in *lst* to ``京东总总评论数据.csv``.

    The encoding is pinned to UTF-8: with the platform default (e.g. GBK on a
    Chinese-locale Windows) scraped comments containing emoji or other
    unsupported characters would raise UnicodeEncodeError.

    Args:
        lst: Iterable of rows, each row an iterable of cell values.
    """
    with open('京东总总评论数据.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows(lst)

def start():
    """Download the comment page and pass its text on to the parser."""
    parse_html(send_request())


if __name__ == '__main__':
    start()

EXCEL存储数据

openpyxl模块

可以读取和写入excel文件
处理excel数据，公式，样式
在表格内插入图表

向EXCEL中写入数据

创建工作簿对象 openpyxl.Workbook()
获取活动工作表对象 wb.active
获取单元格 sheet[单元格名称]
向单元格中写入数据 cell.value = 值
向Excel中写入一行数据 sheet.append(列表)
保存Excel文件 wb.save(文件)

# Write data into an Excel file with openpyxl
import openpyxl

# Create the workbook and take its active worksheet
wb = openpyxl.Workbook()
sheet = wb.active

# Address a single cell by name and assign its value
cell = sheet['A1']
cell.value = '中国美丽'

# One header row followed by several data rows; append() writes one list
# per call, so all rows go through the same loop
header = ['姓名', '年龄', '成绩']
records = [
    ['张三', 23, 98],
    ['李四', 22, 97],
    ['王五', 20, 95],
]
for line in [header] + records:
    sheet.append(line)

# Persist the workbook to disk
wb.save('我的excel文件.xlsx')

从Excel文件中读取数据

加载工作簿对象 openpyxl.load_workbook(文件名)
获取活动工作表对象 wb.active
获取单元格 sheet[单元格名称]
获取单元格的值 cell.value
获取一系列格子 sheet[‘A’] ,sheet[‘3’],sheet[‘A:C’]
获取整个表格的所有行 sheet.rows

# Read data back from the Excel file
import openpyxl

wb = openpyxl.load_workbook('我的excel文件.xlsx')
# Look the worksheet up by its title (wb.active is the alternative)
sheet = wb['Sheet']

# Content of one specific cell
print(sheet['A1'].value)

# Every cell in column A
for column_cell in sheet['A']:
    print(column_cell.value)
print('--------------------')

# Every cell in row 3
for row_cell in sheet[3]:
    print(row_cell.value)
print('-----------')

# Columns B through C: the outer loop walks the columns, the inner loop the
# cells of each column
for column in sheet['B:C']:
    for column_cell in column:
        print(column_cell.value)
print('------------------')

爬取下厨房菜品数据

# 爬取下厨房的菜品数据
import requests
from bs4 import BeautifulSoup
import openpyxl
def send_request():
    """Download the xiachufang "explore" page and return its HTML text."""
    url = 'https://www.xiachufang.com/explore/'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
    # Without a timeout, requests waits indefinitely on a stalled connection.
    resp = requests.get(url, headers=headers, timeout=10)
    return resp.text
def parse_html(data):
    """Parse the explore page and save one row per dish.

    Each saved row is ``[index, name text, "ing ellipsis" text, url]`` with
    the index starting at 1.

    Args:
        data: HTML text of the page.
    """
    bs = BeautifulSoup(data, 'lxml')
    list_name = bs.find_all('p', class_='name')
    # presumably the ingredient line of each dish -- confirm against the page
    list_catagory = bs.find_all('p', class_='ing ellipsis')
    lst = []
    # zip() pairs each name with its companion tag and stops at the shorter
    # list, so a length mismatch cannot raise IndexError.
    for count, (name_tag, category_tag) in enumerate(zip(list_name, list_catagory), start=1):
        food_url = 'https://www.xiachufang.com' + name_tag.find('a')['href']
        # NOTE(review): the fixed slices appear to trim the whitespace padding
        # that surrounds the text in the page markup -- verify on a live page.
        lst.append([count, name_tag.text[18:-15], category_tag.text[1:-1], food_url])
    save(lst)
def save(lst):
    """Append every row of *lst* to a new workbook saved as ``下厨房美食.xlsx``."""
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    for record in lst:
        worksheet.append(record)
    workbook.save('下厨房美食.xlsx')
def start():
    """Download the explore page and pass its HTML on to the parser."""
    parse_html(send_request())


if __name__ == '__main__':
    start()

MySql存储数据

MySql基础操作

启动MySql服务 net start mysql80
登陆MySql服务器 mysql -h127.0.0.1 -uroot -proot -P3306
关闭MySql服务 net stop mysql80

Mysql 数据类型

数值类型： int(4个字节) float(4个字节) double(8个字节)
字符串类型：char 固定长度字符串 varchar：可变长度字符串
日期/时间类型：Date:YYYY-MM-DD DateTime:YYYY-MM-DD HH:MM:SS Time:HH:MM:SS
Mysql支持在该类型关键字后面的括号内指定整数值的显示宽度(int(4)),这个显示宽度并不能限制值的范围，也不限制值的显示

SQL语言

数据定义语言（create,drop,alter等语句）
数据查询语言(select语句)
数据操纵语言（insert，delete，update语句）
数据控制语言（grant，revoke，commit，rollback等语句）
数据操纵语言针对表中的数据，数据定义语言针对数据库或表

python与mysql交互

常用操作

插入数据 insert
查询数据 select
更新数据 update

创建数据库连接

connect(host,user,passwd,database)

插入数据操作步骤

获取连接对象
获取cursor对象
编写SQL语句
执行sql语句
提交事务

import mysql.connector

# Open the connection
conn = mysql.connector.connect(host='localhost',user='root',password='root',database='mytestdb',auth_plugin='mysql_native_password')
print(conn)
try:
    mycursor = conn.cursor()
    try:
        # SQL with %s placeholders; the values are passed separately
        sql = 'insert into dept(deptno,dname,loc) values(%s,%s,%s)'
        val = (50,'开发部','北京')
        # Execute the statement
        mycursor.execute(sql,val)
        # Commit the transaction so the insert is persisted
        conn.commit()
        print(mycursor.rowcount,'记录插入成功')
    finally:
        # Release the cursor even when execute/commit raises
        mycursor.close()
finally:
    # Always release the server connection
    conn.close()

批量插入数据操作步骤

获取连接对象
获取cursor对象
编写sql语句
使用列表赋值
调用executemany()执行sql语句
提交事务

import mysql.connector

# Open the connection
conn = mysql.connector.connect(host='localhost',user='root',password='root',database='mytestdb',auth_plugin='mysql_native_password')
try:
    mycursor = conn.cursor()
    try:
        # SQL with %s placeholders; one tuple of values per row to insert
        sql = 'insert into dept(deptno,dname,loc) values(%s,%s,%s)'
        vals = [
            (60,'财务部','上海'),
            (70,'测试部','长春'),
            (80,'市场部','深圳')
        ]
        # executemany() runs the statement once per tuple in vals
        mycursor.executemany(sql,vals)
        # Commit the transaction so the inserts are persisted
        conn.commit()
        print(mycursor.rowcount,'记录插入成功')
    finally:
        # Release the cursor even when executemany/commit raises
        mycursor.close()
finally:
    # Always release the server connection
    conn.close()

查询操作步骤

获取连接对象
获取cursor对象
编写SQL语句
执行SQL语句
调用fetchall()方法获取返回结果，结果为列表类型
遍历列表

import mysql.connector

# Open the connection
conn = mysql.connector.connect(host='localhost',user='root',password='root',database='mytestdb',auth_plugin='mysql_native_password')
try:
    mycursor = conn.cursor()
    try:
        # Query every row of the dept table
        sql = 'select * from dept;'
        mycursor.execute(sql)
        # fetchall() returns the remaining result rows as a list
        ls = mycursor.fetchall()
        print(ls)
    finally:
        # Release the cursor even when the query raises
        mycursor.close()
finally:
    # Always release the server connection
    conn.close()

修改、删除操作步骤

获取连接对象
获取cursor对象
编写sql
执行sql
提交事务

import mysql.connector

# Open the connection
conn = mysql.connector.connect(host='localhost',user='root',password='root',database='mytestdb',auth_plugin='mysql_native_password')
try:
    mycursor = conn.cursor()
    try:
        # Values are passed as parameters, matching the insert examples,
        # instead of being embedded in the SQL text.
        sql = 'update dept set dname=%s where deptno=%s'
        val = ('Python',50)
        # To delete instead:
        #   sql = 'delete from dept where deptno=%s'; val = (80,)
        mycursor.execute(sql,val)
        # Commit the transaction so the change is persisted
        conn.commit()
        print(mycursor.rowcount,'修改成功')
    finally:
        # Release the cursor even when execute/commit raises
        mycursor.close()
finally:
    # Always release the server connection
    conn.close()

爬取链家二手房成交量

import requests
from bs4 import BeautifulSoup
import mysql.connector
# https://bj.lianjia.com/chengjiao/pg2/
class LianJiaSpider:
    """Scrape Lianjia (bj.lianjia.com) closed-deal listings into MySQL.

    Pages 1-99 of ``/chengjiao/pg{n}/`` are fetched one by one; every listing
    found on a page becomes one row of the ``tb_lianjia`` table.
    """

    # NOTE(review): these are class attributes, so the database is contacted
    # as soon as the class body runs, not when an instance is created.
    mydb = mysql.connector.connect(host='localhost',user='admin',password='admin123',database='mytestdb',auth_plugin='mysql_native_password')
    mycursor = mydb.cursor()

    def __init__(self):
        # {0} is filled with the page number in start()
        self.url = 'https://bj.lianjia.com/chengjiao/pg{0}/'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}

    def send_request(self, url):
        """Return the response for *url*, or None when the status is not 200."""
        # Without a timeout, requests waits indefinitely on a stalled connection.
        resp = requests.get(url, headers=self.headers, timeout=10)
        if resp.status_code == 200:
            return resp
        return None

    @staticmethod
    def _text(tag):
        """Return the text of *tag*, or '' when the element was not found."""
        return '' if tag is None else tag.text

    def parse_html(self, resp):
        """Extract every listing from *resp* and store the rows via save().

        Args:
            resp: Response returned by send_request(); None (a failed
                request) is skipped silently.
        """
        if resp is None:
            return
        bs = BeautifulSoup(resp.text, 'lxml')
        ul = bs.find('ul', class_='listContent')
        if ul is None:
            # No listing container on this page; nothing to store.
            return
        lst = []
        for item in ul.find_all('li'):
            title_tag = item.find('div', class_='title')
            if title_tag is None:
                # presumably not a listing entry -- skip it rather than crash
                continue
            title = title_tag.text
            houseInfo = self._text(item.find('div', class_='houseInfo'))
            dealDate = self._text(item.find('div', class_='dealDate'))
            totalPrice = self._text(item.find('div', class_='totalPrice'))
            positionInfo = self._text(item.find('div', class_='positionInfo'))
            unitPrice = self._text(item.find('div', class_='unitPrice'))

            # Optional block: the texts of its inner spans joined by '/'
            dealHouseTxt = item.find('span', class_='dealHouseTxt')
            house_spans = dealHouseTxt.find_all('span') if dealHouseTxt is not None else []
            dealHouseTxt_txt = '/'.join(span.text for span in house_spans)

            # The first two inner spans feed the deal_money and
            # dealcycledate columns; missing ones become ''.
            dealCycleTxt = item.find('span', class_='dealCycleTxt')
            cycle_spans = dealCycleTxt.find_all('span') if dealCycleTxt is not None else []
            cycle_texts = [span.text for span in cycle_spans]
            deal_money, dealcycledate = (cycle_texts + ['', ''])[:2]

            agent_name = self._text(item.find('a', class_='agent_name'))
            lst.append((title, houseInfo, dealDate, totalPrice, positionInfo,
                        unitPrice, dealHouseTxt_txt, deal_money, dealcycledate,
                        agent_name))
        self.save(lst)

    def save(self, lst):
        """Insert the row tuples in *lst* into ``tb_lianjia`` and commit."""
        if not lst:
            return
        sql = 'insert into tb_lianjia(title,houseInfo,dealDate,totalPrice,positionInfo,unitPrice,dealHouseTxt,deal_money,dealcycledate,agent_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);'
        self.mycursor.executemany(sql, lst)
        self.mydb.commit()
        print(self.mycursor.rowcount, '数据插入完毕')

    def start(self):
        """Fetch and store pages 1 through 99."""
        for i in range(1, 100):
            full_url = self.url.format(i)
            resp = self.send_request(full_url)
            self.parse_html(resp)


if __name__ == '__main__':
    lianjia = LianJiaSpider()
    # lianjia.start()

MongoDB 存储数据

MongoDB简介

MongoDB是一个高性能，开源，无模式的文档型数据库。它在许多场景下用于替代传统的关系型数据库或键值对存储方式
是用C++开发，基于分布式文件存储的开源数据库系统
将数据存储为一个文档，数据结构由键值对组成
MongoDB文档类似JSON对象
字段值可以包含其他文档，数组及文档数组

MongoDB中常用的数据类型

类型	名称
Object ID	文档ID
String	字符串，最常用，必须是有效的UTF-8
Boolean	存储一个布尔值，true或false
Integer	可以是32位或64位
Double	存储浮点数
Arrays	数组或列表，多个值存储到一个键
Object	用于嵌入式的文档，即一个值为一个文档
Null	存储null值
Timestamp	时间戳
Date	存储当前日期或时间的UNIX时间格式

Object ID

每个文档都有一个属性，为_id,保证每个文档的唯一性
可以自己去设置_id插入文档
如果没有提供，那么MongoDB为每个文档提供了独特_id,类型为ObjectID
ObjectID 是一个12字节的十六进制数：
前四个字节为当前时间戳
接下来三个字节为机器id
接下来两个字节为MongoDB的服务进程id
最后三个字节是简单的增量值

MongoDB常用操作

序号	命令	描述
1	show databases;	查看已有数据库
2	use database;	选择数据库
3	show tables;	查看已有表
4	show collections;	查看已有集合
5	db.createCollection(‘表名’)	建表
6	db.集合名.drop()	删除集合
7	db.dropDatabase()	删除库

MongoDB的增删改操作

命令	描述
db.集合名.insert(document)	数据的添加
db.集合名.save(document)	如果存在就更新，不存在就添加
db.集合名.update(query,update,multi)	query:查询条件，类似where update:更新操作符,类似set multi：可选，默认是false表示只更新找到的第一条记录，值为true表示把满足条件的文档全部更新
db.集合名.remove(query)	删除数据，query为删除条件

MongoDB的查询操作

命令	描述
db.集合名.find({条件文档})	查找所有的匹配数据
db.集合名.findOne({条件文档})	只返回匹配的第一个数据
db.集合名.find().limit(number)	用于读取指定数量的文档
db.集合名.find().skip(number)	用于跳过指定数量的文档
db.集合名.find().sort(…)	参数为1为升序，参数为-1为降序
db.集合名.find({条件}).count()	用于统计结果集中文档条数
db.集合名.distinct(field)	去重

MongoDB中的比较符号

符号	含义	示例
$eq	等于	{‘age’:20} {‘age’:{’$eq’:20}}
$lt	小于	{‘age’:{’$lt’:20}}
$gt	大于	{‘age’:{’$gt’:20}}
$lte	小于等于	{‘age’:{’$lte’:20}}
$gte	大于等于	{‘age’:{’$gte’:20}}
$ne	不等于	{‘age’:{’$ne’:20}}
$in	在范围内	{‘age’:{’$in’:[20,30]}}
$nin	不在范围内	{‘age’:{’$nin’:[20,30]}}
$or	或	{’$or’:[{},{}]}

创建数据库
use school;
创建集合(表)
db.createCollection('student');
（1）插入数据
插入一条数据
db.student.insert({'name':'张三','age':21,'gender':'男'});

插入多条数据
db.student.insert([{'name':'李四','age':21,'gender':'女'},{'name':'王五','age':20,'gender':'男'},{'name':'陈六','age':22,'gender':'男'}]);

循环插入多条数据（js写法）
for(i=20;i<=25;i++){db.student.insert({'name':'麻七','age':i})}

插入操作save
db.student.save({'_id':1,'name':'lili'});
db.student.save({'_id':2,'name':'lili'});

save与insert的区别：
如果collection中的id有相同的情况下，insert操作报错
如果collection中的id有相同的情况下，save执行更新操作
如果collection中的id没有相同的情况下，save执行新增操作

（2）修改操作
update会执行全文档更新，如果想保留原值，那么需要将所有的原值都写入
db.student.update({'name':'marry'},{'age':25,'name':'marry','gender':'男'});
如果想保留原值的第二种方式，是用$set设置
db.student.update({'name':'麻七'},{$set:{'age':27}});

第三个参数 multi 默认为false，只更新符合条件的第一条数据，改为true，就更新符合条件的所有数据
db.student.update({'name':'麻七'},{$set:{'age':27}},{'multi':true});

（3）删除操作
将姓名为lili的数据全部删除
db.student.remove({'name':'lili'});

删除符合条件的第一条数据
db.student.remove({'name':'麻七'},{'justOne':true});

将collection中所有数据进行删除
db.student.remove({});


（4） 查询操作
a:查询全部数据
db.student.find()
b:查询匹配数据
db.student.find({'name':'张三'})
c:只返回匹配的第一条数据
db.student.findOne({'name':'麻七'})
d:等值查询
db.student.find({'age':20})
db.student.find({'age':{'$eq':22}})
e:非等值查询
    查询年龄小于22
    db.student.find({'age':{'$lt':22}})
    查询年龄小于等于22
    db.student.find({'age':{'$lte':22}})
    查询年龄大于22
    db.student.find({'age':{'$gt':22}})
    查询年龄大于等于22
    db.student.find({'age':{'$gte':22}})
    查询年龄20或23的
    db.student.find({'age':{'$in':[20,23]}})
    查询年龄是20的或者姓名是麻七的人员
    db.student.find({'$or':[{'age':20},{'name':'麻七'}]})
f: 模糊匹配
    查询名字中含有麻的数据
    db.student.find({'name':/麻/})
    db.student.find({'name':{'$regex':'麻'}})

g: js 写法
    查询大于20岁的数据
    db.student.find({'$where':function(){return this.age>20}})
查询指定数量的文档
db.student.find().limit(3)
跳过3条再查3条
db.student.find().limit(3).skip(3)
排序查询
db.student.find().sort({age:1}) 1表示升序
db.student.find().sort({age:-1}) -1表示降序
查询文档个数
db.student.find().count()
db.student.find({'name':'麻七'}).count()
去重查询
db.student.distinct('age')
查询有年龄的学生信息
db.student.find({'age':{$exists:true}})  true表示含有年龄的学员
db.student.find({'age':{$exists:false}}) false表示不含有年龄的学员

Python与MongoDB的交互

使用步骤

导入pymongo import pymongo
连接客户端 client = pymongo.MongoClient(‘localhost’,27017)
获取student数据库 db = client.student 或 client[‘student’]
获取集合 collection = db.stu 或 collection = db[‘stu’]

增删改查

新增：s1 = {‘name’:‘张三’,‘age’:20} collection.insert_one(s1) collection.insert_many([{},{}])

import pymongo

# Connect to the server
client = pymongo.MongoClient('localhost', 27017)

# Attribute access selects the same database and collection as the
# subscript form client['school'] / db['student']
db = client.school
collection = db.student

# A single document would be inserted with collection.insert_one({...});
# here several documents are inserted in one call
documents = [
    {'name': '王二二', 'age': 22},
    {'name': '张施南生', 'gender': '男'},
    {'name': '紫苏水', 'age': 24},
]
collection.insert_many(documents)
print(collection)

更新：collection.update_one({'name':'李'},{'$set':{'age':20}}) collection.update_many({'name':'李'},{'$set':{'age':20}})

import pymongo

# Connect to the server
client = pymongo.MongoClient('localhost', 27017)

# Attribute access selects the same database and collection as the
# subscript form client['school'] / db['student']
db = client.school
collection = db.student

# update_one() would change a single matching document; update_many()
# applies the $set to every document that matches the filter
name_filter = {'name': '麻七'}
change = {'$set': {'gender': '男'}}
collection.update_many(name_filter, change)

删除：collection.delete_one({'name':'李'}) collection.delete_many({'age':{'$gte':20}})

import pymongo

# Connect to the server
client = pymongo.MongoClient('localhost', 27017)

# Attribute access selects the same database and collection as the
# subscript form client['school'] / db['student']
db = client.school
collection = db.student

# delete_one() would remove a single matching document; delete_many()
# removes every document that matches the filter
age_filter = {'age': 20}
collection.delete_many(age_filter)

查询：collection.find() collection.find_one()

import pymongo

SEPARATOR = '----------------------'


def show(cursor):
    """Print every document produced by *cursor*, one per line."""
    for document in cursor:
        print(document)


# Connect to the server
client = pymongo.MongoClient('localhost', 27017)

# Attribute access selects the same database and collection as the
# subscript form client['school'] / db['student']
db = client.school
collection = db.student

# All documents
show(collection.find())
print(SEPARATOR)

# Documents whose name equals the given value
show(collection.find({'name': '麻七'}))
print(SEPARATOR)

# Documents whose name matches the regular expression
show(collection.find({'name': {'$regex': '.*二'}}))
print(SEPARATOR)

# Sorted by age, descending (pymongo.ASCENDING gives the opposite order)
show(collection.find().sort('age', pymongo.DESCENDING))
print(SEPARATOR)

# Sorted, at most three documents
show(collection.find().sort('age', pymongo.DESCENDING).limit(3))
print(SEPARATOR)

# Sorted, skipping three documents and returning at most three
show(collection.find().sort('age', pymongo.DESCENDING).limit(3).skip(3))

爬取链家二手房成交量

import requests
from bs4 import BeautifulSoup
import pymongo
# https://bj.lianjia.com/chengjiao/pg2/
class LianJiaSpider:
    """Scrape Lianjia (bj.lianjia.com) closed-deal listings into MongoDB.

    Pages 1-99 of ``/chengjiao/pg{n}/`` are fetched one by one; every listing
    found on a page becomes one document in ``lianjia.collection_lianjia``.
    """

    def __init__(self):
        # {0} is filled with the page number in start()
        self.url = 'https://bj.lianjia.com/chengjiao/pg{0}/'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}

    def send_request(self, url):
        """Return the response for *url*, or None when the status is not 200."""
        # Without a timeout, requests waits indefinitely on a stalled connection.
        resp = requests.get(url, headers=self.headers, timeout=10)
        if resp.status_code == 200:
            return resp
        return None

    @staticmethod
    def _text(tag):
        """Return the text of *tag*, or '' when the element was not found."""
        return '' if tag is None else tag.text

    def parse_html(self, resp):
        """Extract every listing from *resp* and store the documents via save().

        Args:
            resp: Response returned by send_request(); None (a failed
                request) is skipped silently.
        """
        if resp is None:
            return
        bs = BeautifulSoup(resp.text, 'lxml')
        ul = bs.find('ul', class_='listContent')
        if ul is None:
            # No listing container on this page; nothing to store.
            return
        lst = []
        for item in ul.find_all('li'):
            title_tag = item.find('div', class_='title')
            if title_tag is None:
                # presumably not a listing entry -- skip it rather than crash
                continue

            # Optional block: the texts of its inner spans joined by '/'
            dealHouseTxt = item.find('span', class_='dealHouseTxt')
            house_spans = dealHouseTxt.find_all('span') if dealHouseTxt is not None else []

            # The first two inner spans feed the deal_money and
            # dealcycledate fields; missing ones become ''.
            dealCycleTxt = item.find('span', class_='dealCycleTxt')
            cycle_spans = dealCycleTxt.find_all('span') if dealCycleTxt is not None else []
            cycle_texts = [span.text for span in cycle_spans]
            deal_money, dealcycledate = (cycle_texts + ['', ''])[:2]

            lst.append({
                'title': title_tag.text,
                'houseInfo': self._text(item.find('div', class_='houseInfo')),
                'dealDate': self._text(item.find('div', class_='dealDate')),
                'totalPrice': self._text(item.find('div', class_='totalPrice')),
                'positionInfo': self._text(item.find('div', class_='positionInfo')),
                'unitPrice': self._text(item.find('div', class_='unitPrice')),
                'dealHouseTxt': '/'.join(span.text for span in house_spans),
                'deal_money': deal_money,
                'dealcycledate': dealcycledate,
                'agent_name': self._text(item.find('a', class_='agent_name')),
            })
        self.save(lst)

    def save(self, lst):
        """Insert the documents in *lst* into ``lianjia.collection_lianjia``."""
        if not lst:
            # insert_many() rejects an empty list
            return
        # The with-block closes the client after each page instead of
        # leaving one open connection pool behind per call.
        with pymongo.MongoClient('localhost', 27017) as client:
            db = client['lianjia']
            collection = db['collection_lianjia']
            collection.insert_many(lst)

    def start(self):
        """Fetch and store pages 1 through 99."""
        for i in range(1, 100):
            full_url = self.url.format(i)
            resp = self.send_request(full_url)
            self.parse_html(resp)


if __name__ == '__main__':
    lianjia = LianJiaSpider()
    lianjia.start()

夜色的最黑暗。

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
python爬虫（四）数据存储

python爬虫（四）数据存储JSON文件存储JSON是一种轻量级的数据交换格式，它是基于ECMAScript的一个子集JSON采用完全独立于语言的文本格式JSON在Python中分别由list和dict组成JSON模块的功能序号函数描述1json.dumps()实现python类型转化为json字符串，返回一个str对象2json.loads()把json格式的字符串转换成python类型3json.dump()将python内置序列化为jso
复制链接

扫一扫

专栏目录