- 数据库存储
- 关系型数据库存储 ,比如mysql
- 非关系型数据库存储,比如 mongodb
1,使用csv格式存储数据
def save_to_csv(book_list):
    """Save a list of book records to ``dangdang.csv`` in the working directory.

    The file is (re)created on every call, encoded as UTF-8, and starts with
    the header row ``name,author,price``.

    Params:
        book_list: iterable of dicts keyed by 'name', 'author' and 'price'.

    Returns:
        None

    Raises:
        ValueError: if a record contains a key that is not one of the
            header fields (default ``csv.DictWriter`` behaviour).
    """
    headers = ['name', 'author', 'price']
    # The context manager guarantees the file is flushed and closed even if
    # writing a row raises; newline='' lets the csv module control line endings.
    with open('dangdang.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, headers)
        writer.writeheader()
        # Write all rows in one call instead of looping over writerow().
        writer.writerows(book_list)
2,使用mysql存储数据
登录数据库:mysql -uroot -p
查看数据库:show databases;
删除某个数据库:drop database test1;
创建一个数据库:create database booksdb charset=utf8;
使用数据库:use booksdb; (重要)
创建表: create table tb_books(id int primary key auto_increment,name varchar(200),price decimal(5,1),author varchar(200));
查看表的结构: desc tb_books; (描述一下表的结构)
查询表数据:select * from tb_books;
查询表指定的数据:select name from tb_books;
退出:exit
3,python与mysql交互
安装及导入pymysql模块
创建一个Connection连接对象
由连接对象 创建游标
通过游标执行sql
通过游标获取返回的结果
关闭游标,连接
import pymysql
def save_to_mysql(book_list):
    """Insert every record of ``book_list`` into the ``tb_books`` MySQL table.

    Steps performed:
        1. Create a pymysql connection to the ``booksdb`` database.
        2. Obtain a cursor from the connection.
        3. Execute one parameterized INSERT per record.
        4. Commit on success; on any exception print it and roll back.
        5. Release the cursor and the connection.

    :param book_list: records to store; each is indexed by 'name',
        'author' and 'price'
    :return: None
    """
    # Connection settings (pymysql accepts user, password, host, database,
    # unix_socket, port, charset, ...).
    connect_args = {
        'user': 'root',
        'password': '123456',
        'host': '127.0.0.1',
        'port': 3306,
        'database': 'booksdb',
        'charset': 'utf8',
    }
    connection = pymysql.Connection(**connect_args)
    cursor = connection.cursor()
    insert_sql = 'insert into tb_books(name,author,price) values (%s,%s,%s)'
    try:
        for book in book_list:
            # The first character of the price is dropped before storing
            # (presumably a currency symbol -- verify against the scraper).
            row = (book['name'], book['author'], book['price'][1:])
            cursor.execute(insert_sql, row)
        connection.commit()  # commit the transaction
    except Exception as err:
        print(err)
        connection.rollback()  # undo everything inserted in this transaction
    finally:
        cursor.close()
        connection.close()
4,Requests高级
实现 访问状态,连接状态 的保持
服务端可以依据cookie的信息,判断你是用浏览器访问的,还是用爬虫程序访问的
- 掌握 headers中携带cookie - 掌握 cookies参数的使用 - 掌握 cookieJar的转换方法
4.1 在headers参数中携带cookie+cookies参数的使用-字典(反爬手段1)
带上cookie的好处:
能够访问登录后的页面
正常的浏览器在请求服务器的时候会带上cookie(第一次请求除外),所以对方服务器有可能会通过是否携带cookie来判断我们是否是一个爬虫,对应的能起到一定的反爬效果
带上cookie的坏处:
一套cookie往往对应的是一个用户的信息,请求太频繁有更大可能性被对方识别为爬虫
那么,面对这种情况如何解决---->使用多个账号
import requests
url = 'https://movie.douban.com/'
# [Cookie method 1: send the cookie inside the request headers]
# Build the request-header dict
# headers = {
# # User-Agent copied from the browser
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.82',
# # Cookie string copied from the browser
# 'Cookie': 'bid=RfS38MB7LV8; ll="108309"; _pk_id.100001.4cf6=8d60d63b4c9733e1.1688615822.; __yadk_uid=5BZowsDNUnjI9443kNVPdCDPxyW6bqvZ; _vwo_uuid_v2=DC4565862FDAAFB21D9B5778FCBFE7037|77eb2c86622a70a921de5785bedfd468; __utmz=30149280.1689693957.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmz=223695111.1689693957.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1689738882%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D7UmSxW7mLU4NRvxHE9Q0M1QLIYdUCddwIs56ny6doL19jk2PAEDBEixYj5FlnsFF%26wd%3D%26eqid%3De86882b2000010570000000664a63b7e%22%5D; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __utma=30149280.282734834.1689693957.1689693957.1689738883.2; __utmb=30149280.0.10.1689738883; __utmc=30149280; __utma=223695111.1316880856.1689693957.1689693957.1689738883.2; __utmb=223695111.0.10.1689738883; __utmc=223695111; __gads=ID=71cbac3fcc305d63-22b3a0f51cdf00e4:T=1681650656:RT=1689738896:S=ALNI_MZJXg8nMxqoktIYpF8AfaQEW7IhAg; __gpi=UID=00000bf61bdb657d:T=1681650656:RT=1689738896:S=ALNI_MYPcnEorGFmTBwywHc4et_PtqinqA'
# }
#
# # Send the cookie string as part of the headers dict
# resp = requests.get(url, headers=headers)
#
# print(resp.text) # user info appearing in the output means the logged-in page was fetched
# [Cookie method 2: pass the cookies as a dict via the `cookies` parameter]
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.82'
}
# Cookie header in its raw string form, as copied from the browser
cookie_str = 'bid=RfS38MB7LV8; ll="108309"; _pk_id.100001.4cf6=8d60d63b4c9733e1.1688615822.; __yadk_uid=5BZowsDNUnjI9443kNVPdCDPxyW6bqvZ; _vwo_uuid_v2=DC4565862FDAAFB21D9B5778FCBFE7037|77eb2c86622a70a921de5785bedfd468; __utmz=30149280.1689693957.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmz=223695111.1689693957.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1689738882%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D7UmSxW7mLU4NRvxHE9Q0M1QLIYdUCddwIs56ny6doL19jk2PAEDBEixYj5FlnsFF%26wd%3D%26eqid%3De86882b2000010570000000664a63b7e%22%5D; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __utma=30149280.282734834.1689693957.1689693957.1689738883.2; __utmb=30149280.0.10.1689738883; __utmc=30149280; __utma=223695111.1316880856.1689693957.1689693957.1689738883.2; __utmb=223695111.0.10.1689738883; __utmc=223695111; __gads=ID=71cbac3fcc305d63-22b3a0f51cdf00e4:T=1681650656:RT=1689738896:S=ALNI_MZJXg8nMxqoktIYpF8AfaQEW7IhAg; __gpi=UID=00000bf61bdb657d:T=1681650656:RT=1689738896:S=ALNI_MYPcnEorGFmTBwywHc4et_PtqinqA'
# Convert the cookie string to a dict (dict comprehension).
# Each "name=value" pair is split on the FIRST '=' only: several values
# (e.g. __utmz, __gads) contain '=' themselves, and taking the text after the
# last '=' would truncate them to their final fragment.
cookie_dict = {
    name: value
    for name, _, value in (pair.partition('=') for pair in cookie_str.split('; '))
}
# print(cookie_dict)
response = requests.get(url=url, headers=headers, cookies=cookie_dict)
print(response.text)