注释:本文仅用于技术学习
1、分享链接存入缓存redis
2、python读取缓存队列信息
3、访问页面解析页面的值
4、存入mysql数据库
redis缓存中事先存入待抓取的链接队列,抓取结果写入数据库
数据库结构
CREATE TABLE `tj_douyin` (
`id` int(11) unsigned NOT NULL AUTO_INCREMENT,
`dy_id` varchar(50) DEFAULT NULL,
`dy_name` varchar(100) DEFAULT NULL COMMENT '名字',
`focus` int(11) DEFAULT NULL COMMENT '关注数',
`follower` int(11) DEFAULT NULL COMMENT '粉丝数',
`liked` int(11) DEFAULT NULL COMMENT '赞数量',
`works` int(11) DEFAULT NULL COMMENT '作品数',
`like` int(11) DEFAULT NULL COMMENT '喜欢数',
`datetime` bigint(11) NOT NULL COMMENT '写入时间',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=23 DEFAULT CHARSET=utf8;
代码部分:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Date : 2019/10/29
import requests
import pymysql
import time
import redis
from bs4 import BeautifulSoup as bs
# unicode对应的数字
# Maps each obfuscated private-use-area glyph to the decimal digit it
# renders as. Douyin serves three font variants, so every digit has
# three possible code points; the tuple key groups the variants.
change = {
    ('\ue603', '\ue60d', '\ue616'): '0',
    ('\ue602', '\ue60e', '\ue618'): '1',
    ('\ue605', '\ue610', '\ue617'): '2',
    ('\ue604', '\ue611', '\ue61a'): '3',
    ('\ue606', '\ue60c', '\ue619'): '4',
    ('\ue607', '\ue60f', '\ue61b'): '5',
    ('\ue608', '\ue612', '\ue61f'): '6',
    ('\ue60a', '\ue613', '\ue61c'): '7',
    ('\ue60b', '\ue614', '\ue61d'): '8',
    ('\ue609', '\ue615', '\ue61e'): '9',
}
"""
change = {
'\ue602': '1', '\ue60E': '1', '\ue618': '1',
'\ue603': '0', '\ue60d': '0', '\ue616': '0',
'\ue604': '3', '\ue611': '3', '\ue61a': '3',
'\ue605': '2', '\ue610': '2', '\ue617': '2',
'\ue606': '4', '\ue60c': '4', '\ue619': '4',
'\ue607': '5', '\ue60f': '5', '\ue61b': '5',
'\ue608': '6', '\ue612': '6', '\ue61f': '6',
'\ue609': '9', '\ue615': '9', '\ue61e': '9',
'\ue60a': '7', '\ue613': '7', '\ue61c': '7',
'\ue60b': '8', '\ue614': '8', '\ue61d': '8'
}
"""
cites_codes = []  # accumulates parsed rows: [dy_id, name, focus, follower, liked, works, like]
Team_name = 'dou_yin'  # name of the Redis list (queue) holding profile URLs
soup = ''  # module-level BeautifulSoup document, set by main() and read by get_num()
# Translate one scraped token from the obfuscation font into its digit.
def change_2_num(code):
    """Return the decimal digit that the glyph token ``code`` stands for.

    ``code`` is one whitespace-delimited token from the page text. Only its
    first whitespace-split piece is examined (matching the original logic).
    If the token is empty or is not one of the known private-use glyphs
    (e.g. it is a plain label such as '粉丝'), ``code`` is returned
    unchanged so the caller can strip it off afterwards.
    """
    tokens = code.split()
    if not tokens:
        # The original bare `except` printed an error here and fell through;
        # an empty token simply cannot be a glyph, so return it as-is.
        return code
    for glyphs, digit in change.items():
        if tokens[0] in glyphs:
            return digit
    return code
# Fetch a profile page; returns a BeautifulSoup document or None on failure.
def get_html(url):
    """Download ``url`` and return the page parsed with lxml.

    Returns None when the response body is under ~10 kB, which the original
    author used as a cheap heuristic for a blocked or empty page.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60"}
    # timeout added: requests blocks indefinitely without one, which would
    # hang the whole queue worker on a single stuck request.
    # NOTE(review): verify=False disables TLS certificate checking — kept
    # from the original behavior, but should be re-enabled if possible.
    r = requests.get(url, headers=headers, verify=False, timeout=10)
    if not len(r.text) > 10000:
        return None
    return bs(r.text, 'lxml')
def get_num(name, attrs):
    """Extract an obfuscated number from the first element matching
    ``name``/``attrs`` in the module-global ``soup``.

    The element's text is split on whitespace (so blank runs don't become
    empty tokens), each token is decoded via change_2_num(), and the pieces
    are concatenated. Returns None when no matching element exists.
    """
    matches = soup.find_all(name=name, attrs={'class': attrs})
    if not matches:
        return
    # Tokens that are glyphs decode to digits; anything else (labels like
    # '粉丝') passes through unchanged for the caller to strip.
    return ''.join(change_2_num(token) for token in matches[0].text.split())
# Persist every accumulated row into the tj_douyin table.
def store_2_mysql():
    """Insert each row of the module-global ``cites_codes`` into MySQL.

    Connection settings are local defaults — adjust host/user/password/
    database for your own environment (schema is in the article header).
    """
    conn = pymysql.connect(host='127.0.0.1', user='root', password='', database='test_tianqi', port=3306)
    try:
        with conn.cursor() as cursor:
            # Parameterized query: the driver escapes the values. The
            # original built SQL with %-string formatting, which is open to
            # SQL injection through scraped page text (e.g. a nickname
            # containing a quote character).
            sql = 'insert into tj_douyin values(0, %s, %s, %s, %s, %s, %s, %s, %s)'
            now = int(time.time())
            for row in cites_codes:
                cursor.execute(sql, (row[0], row[1], row[2], row[3], row[4], row[5], row[6], now))
        conn.commit()
    finally:
        # Close even if execute/commit raises (original leaked the
        # connection on error).
        conn.close()
# Pop the next queued URL from Redis.
def select_redis():
    """Pop and return the next URL from the Redis queue ``Team_name``.

    Returns '' when the queue is empty. Adjust host/port/db for your own
    Redis instance.
    """
    con = redis.Redis(
        host='127.0.0.1',
        port=6379,
        db=1,  # Redis database index
        decode_responses=True  # return str instead of bytes
    )
    # lpop already returns None on an empty list, so a single atomic call
    # replaces the original llen-then-lpop pair, which was racy when more
    # than one worker drains the same queue.
    val = con.lpop(Team_name)
    return val if val is not None else ''
def main():
    """Scrape the profile page for the queued URL and persist its stats.

    Reads the module-global ``url`` (set by the __main__ block) and
    publishes the parsed document through the module-global ``soup`` so
    that get_num() can see it. Returns silently when the fetch fails or
    the expected elements are missing from the page.
    """
    global soup
    soup = get_html(url)
    if not soup:
        return
    try:
        # Display name is plain text (not glyph-obfuscated).
        nickname = soup.find(name='p', attrs={'class': 'nickname'}).string
        # Numeric fields are rendered with the obfuscation font: decode the
        # glyphs via get_num(), then strip the Chinese label prefix.
        dyID = get_num('p', "shortid").replace('抖音ID:', '')
        focus = get_num('span', "focus").replace('关注', '')
        follower = get_num('span', "follower").replace('粉丝', '')
        liked = get_num('span', "liked-num").replace('赞', '')
        works = get_num('div', "user-tab").replace('作品', '')
        like = get_num('div', "like-tab").replace('喜欢', '')
    except AttributeError as exc:
        # find()/get_num() return None when an element is missing (layout
        # change or blocked page); the original bare `except` silently hid
        # every failure, including database errors.
        print('main: failed to parse page:', exc)
        return
    cites_codes.append([dyID, nickname, focus, follower, liked, works, like])
    store_2_mysql()  # flush accumulated rows to MySQL
if __name__ == '__main__':
    # Processes one queued URL per invocation; run repeatedly (e.g. from a
    # scheduler) to drain the Redis queue.
    url = select_redis()
    main()
代码部分借鉴:
https://www.cnblogs.com/byadmin/p/11441137.html