python 爬取抖音个人分享页面信息保存到数据库

最新推荐文章于 2024-08-04 22:15:00 发布

天下皆白_唯我独黑

最新推荐文章于 2024-08-04 22:15:00 发布

阅读量1.8k

点赞数 2

分类专栏：Python、爬虫　文章标签：python、爬虫、抖音个人主页

本文链接：https://blog.csdn.net/qq_24909089/article/details/102797302

版权

Python 同时被 2 个专栏收录

39 篇文章 1 订阅

订阅专栏

爬虫

5 篇文章 0 订阅

订阅专栏

注释：本文仅用于技术学习

1、分享链接存入缓存redis

2、python读取缓存队列信息

3、访问页面解析页面的值

4、存入mysql数据库

事先将分享链接存入 Redis 队列（原文此处配有截图）；抓取到的结果保存到数据库。

数据库结构

-- One row per scraped Douyin profile.
-- store_2_mysql() fills this table with a positional INSERT (no column list),
-- so the column order below must stay in step with the value order there:
-- id, dy_id, dy_name, focus, follower, liked, works, like, datetime.
-- NOTE(review): `utf8` is MySQL's 3-byte charset; names containing 4-byte
-- characters (e.g. emoji) would need utf8mb4 -- confirm before relying on it.
CREATE TABLE `tj_douyin` (
  -- Surrogate key; the script inserts 0 here and lets AUTO_INCREMENT assign it.
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  -- Douyin ID as shown on the profile page.
  `dy_id` varchar(50) DEFAULT NULL,
  `dy_name` varchar(100) DEFAULT NULL COMMENT '名字',
  `focus` int(11) DEFAULT NULL COMMENT '关注数',
  `follower` int(11) DEFAULT NULL COMMENT '粉丝数',
  `liked` int(11) DEFAULT NULL COMMENT '赞数量',
  `works` int(11) DEFAULT NULL COMMENT '作品数',
  `like` int(11) DEFAULT NULL COMMENT '喜欢数',
  -- Write time; the script stores int(time.time()), i.e. Unix epoch seconds.
  `datetime` bigint(11) NOT NULL COMMENT '写入时间',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=23 DEFAULT CHARSET=utf8;

代码部分:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Date  : 2019/10/29
import requests
import pymysql
import time
import redis

from bs4 import BeautifulSoup as bs

# Code points the page uses in place of digits, mapped to the digit each one
# stands for. Every digit has three interchangeable code points; they are
# grouped into a single tuple that serves as the dictionary key.
change = {
    glyphs: digit
    for digit, glyphs in (
        ('0', ('\ue603', '\ue60d', '\ue616')),
        ('1', ('\ue602', '\ue60E', '\ue618')),
        ('2', ('\ue605', '\ue610', '\ue617')),
        ('3', ('\ue604', '\ue611', '\ue61a')),
        ('4', ('\ue606', '\ue60c', '\ue619')),
        ('5', ('\ue607', '\ue60f', '\ue61b')),
        ('6', ('\ue608', '\ue612', '\ue61f')),
        ('7', ('\ue60a', '\ue613', '\ue61c')),
        ('8', ('\ue60b', '\ue614', '\ue61d')),
        ('9', ('\ue609', '\ue615', '\ue61e')),
    )
}

# Unused flat variant of the glyph table (one code point per key). It is a bare
# string expression, so it is evaluated and discarded when the module loads and
# never rebinds `change`.
"""
change = {
    '\ue602': '1', '\ue60E': '1', '\ue618': '1',
    '\ue603': '0', '\ue60d': '0', '\ue616': '0',
    '\ue604': '3', '\ue611': '3', '\ue61a': '3',
    '\ue605': '2', '\ue610': '2', '\ue617': '2',
    '\ue606': '4', '\ue60c': '4', '\ue619': '4',
    '\ue607': '5', '\ue60f': '5', '\ue61b': '5',
    '\ue608': '6', '\ue612': '6', '\ue61f': '6',
    '\ue609': '9', '\ue615': '9', '\ue61e': '9',
    '\ue60a': '7', '\ue613': '7', '\ue61c': '7',
    '\ue60b': '8', '\ue614': '8', '\ue61d': '8'
}
"""

# Rows waiting to be written: main() appends one list per profile and
# store_2_mysql() inserts every entry it finds here.
cites_codes = []
Team_name = 'dou_yin'  # name of the Redis list (queue) that select_redis() pops from
# Parsed page shared through a global: main() rebinds it, get_num() reads it.
soup = ''


# Translate one scraped token into the digit it stands for
def change_2_num(code):
    """Return the digit that the glyph in *code* represents.

    Only the first whitespace-separated token of *code* is looked up in the
    module-level ``change`` table, whose keys are tuples of glyphs.

    Args:
        code: Text scraped from the page, normally a single glyph.

    Returns:
        The matching digit as a one-character string, or *code* unchanged
        when it is empty, all whitespace, or not a known glyph (for example
        a label such as a caption next to the number).
    """
    tokens = code.split()
    if not tokens:
        # Nothing to look up. Checking here replaces the old per-iteration
        # try/except, which hit the same IndexError once for every table row.
        return code
    glyph = tokens[0]
    for glyphs, digit in change.items():
        if glyph in glyphs:
            return digit
    return code


# Request a URL and return the page as a BeautifulSoup object
def get_html(url, timeout=10):
    """Download *url* and parse the response body with the lxml parser.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection blocks the script forever.

    Returns:
        The parsed document, or None when the response body is 10000
        characters or shorter (such a response is treated as unusable).

    Raises:
        requests.RequestException: On connection problems or timeout.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60"}
    # NOTE(review): verify=False disables TLS certificate checking. It is kept
    # to preserve existing behaviour -- confirm it is still required.
    response = requests.get(url, headers=headers, verify=False, timeout=timeout)
    if len(response.text) <= 10000:
        return None
    return bs(response.text, 'lxml')


def get_num(name, attrs):
    """Read the text of the first matching element and decode its glyphs.

    The element is searched for in the module-level ``soup``.

    Args:
        name: Tag name to search for.
        attrs: Value the element's ``class`` attribute must have.

    Returns:
        The element text with whitespace removed and every token passed
        through ``change_2_num``, or None when no element matches.
    """
    matches = soup.find_all(name=name, attrs={'class': attrs})
    if not matches:
        return
    # Split on whitespace first so each glyph becomes its own token; tokens
    # that are not glyphs come back from change_2_num() unchanged.
    tokens = matches[0].text.split()
    return ''.join(change_2_num(token) for token in tokens)


# Write the collected rows to the database
def store_2_mysql():
    """Insert every entry of the module-level ``cites_codes`` into ``tj_douyin``.

    Each entry supplies seven values (Douyin ID, name, and five counters);
    the current Unix time is appended as the write timestamp. All rows are
    committed together after the last insert.

    Raises:
        pymysql.MySQLError: If connecting or inserting fails. The connection
            is closed in that case as well.
    """
    # Placeholders make the driver escape each value. The values are scraped
    # page text, so interpolating them into the statement would let a quote in
    # a nickname break or alter the SQL.
    sql = 'insert into tj_douyin values(0,%s,%s,%s,%s,%s,%s,%s,%s)'
    # Connection settings: point these at your own database and table.
    conn = pymysql.connect(host='127.0.0.1', user='root', password='', database='test_tianqi', port=3306)
    try:
        with conn.cursor() as cursor:
            for row in cites_codes:
                params = (row[0], row[1], row[2], row[3], row[4], row[5], row[6],
                          int(time.time()))
                cursor.execute(sql, params)
        conn.commit()
    finally:
        # Release the connection even when an insert raises.
        conn.close()


# Fetch the next queued value from Redis
def select_redis():
    """Pop and return the first element of the Redis list named ``Team_name``.

    Returns:
        The popped value as ``str``, or ``''`` when the list is empty or
        does not exist.
    """
    con = redis.Redis(
        host='127.0.0.1',
        port=6379,
        db=1,  # index of the Redis logical database
        decode_responses=True  # return str instead of bytes
    )

    # LPOP already yields None for an empty list. A separate LLEN check would
    # cost an extra round trip and leave a window in which another consumer
    # could drain the list, making this function return None instead of ''.
    val = con.lpop(Team_name)
    return '' if val is None else val


def main():
    """Scrape one profile page and store its counters.

    Reads the module-level ``url`` (assigned under the ``__main__`` guard),
    rebinds the module-level ``soup`` that ``get_num`` searches, appends the
    extracted row to ``cites_codes`` and calls ``store_2_mysql``.

    Parsing or storage failures are reported on stdout and end the run
    without raising, so the script stays best-effort.
    """
    global soup
    # An empty queue yields an empty url; there is nothing to fetch then, and
    # passing '' to requests would raise before any error handling starts.
    if not url:
        return
    soup = get_html(url)
    if not soup:
        return
    try:
        nickname = soup.find(name='p', attrs={'class': 'nickname'}).string  # display name
        # get_num() returns None when an element is missing; the resulting
        # AttributeError from .replace() is reported by the handler below.
        dyID = get_num('p', "shortid").replace('抖音ID：', '')
        focus = get_num('span', "focus").replace('关注', '')
        follower = get_num('span', "follower").replace('粉丝', '')
        liked = get_num('span', "liked-num").replace('赞', '')
        works = get_num('div', "user-tab").replace('作品', '')
        like = get_num('div', "like-tab").replace('喜欢', '')

        # Build the row in the order store_2_mysql() expects.
        cites_codes.append([dyID, nickname, focus, follower, liked, works, like])
        store_2_mysql()
    except Exception as exc:
        # Report the failure instead of swallowing it silently. Catching
        # Exception rather than everything lets Ctrl-C and SystemExit through.
        print('函数main出错', url, exc)


if __name__ == '__main__':
    # `url` is deliberately a module-level name: main() takes no arguments and
    # reads it as a global.
    url = select_redis()
    main()

代码部分借鉴：

https://www.cnblogs.com/byadmin/p/11441137.html