Python 快速提取网页表格

# Quick way to pull the first HTML table of a web page into a CSV file.
# NOTE(review): `code`, `starttime`, `endtime` and `headers` are not defined
# in this snippet; they must be set before it runs.
from io import BytesIO

import pandas as pd
import requests

url = 'https://www.baidu.com/iframe/a.php?q={}&1={}&w={}'.format(code, starttime, endtime)

# Fetch the page exactly once, using the custom headers, and fail loudly on
# an HTTP error instead of parsing an error page.
response = requests.get(url, headers=headers)
response.raise_for_status()

# Parse the bytes that were already downloaded rather than letting pandas
# request the URL a second time (which would ignore `headers`).
tb = pd.read_html(BytesIO(response.content))

# Write the first table found; keep the row index, drop the column header row.
tb[0].to_csv(r'数据.csv', index=True, header=False)
import hashlib, json, requests, time, re, pymysql

from lxml import etree

# TODO: MySQL connection settings (read by mysql() when it opens a connection)
host = '127.0.0.1'  # database server address
users = 'root'  # login user name
pwd = ''  # login password
port = 3306  # database server port
dbs = 'python'  # name of the database to use

def mysql(id, user_name, level, detail, create_time):
    """
    Insert one comment row into the ``mafengwo`` table.

    A connection is opened from the module-level settings (``host``,
    ``users``, ``pwd``, ``port``, ``dbs``) on every call and is always closed
    before returning. A failed insert is rolled back and reported on stdout
    rather than raised.

    :param id: user id
    :param user_name: user name
    :param level: user level
    :param detail: comment text
    :param create_time: publication time
    :return: None
    """
    value = (id, user_name, level, detail, create_time)
    sql = "INSERT INTO mafengwo(id,user_name,level,detail,create_time) values(%s,%s,%s,%s,%s)"
    # `password` / `database` are the documented keyword names; `passwd` / `db`
    # are deprecated aliases.
    db = pymysql.connect(host=host, user=users, password=pwd, port=port, database=dbs)
    try:
        # The cursor is a context manager, so it is closed even on failure.
        with db.cursor() as cursor:
            cursor.execute(sql, value)
        db.commit()
        print('success!')
    except Exception as e:
        # Best-effort storage: undo the partial write and keep going.
        db.rollback()
        print("error.", e)
    finally:
        # Runs even if rollback() itself raises, so the connection never leaks.
        db.close()


def get_params(page, poi_id):
    """
    Build the signed query parameters for the comment-list request.

    The signature ``_sn`` is characters 2..11 of the MD5 hex digest of the
    compact JSON encoding of the parameters followed by a fixed salt.

    :param page: page number
    :param poi_id: hotel id
    :return: dict of query parameters, including ``_ts`` and ``_sn``
    """
    salt = "c9d6618dbc657b41a66eb0af952906f1"
    # Key order matters: the dict is serialised as-is before hashing.
    query = {
        "_ts": str(int(time.time() * 1000)),  # current time in milliseconds
        "keyword_id": "0",
        "page": f"{page}",
        "poi_id": f"{poi_id}",
        "type": "0"
    }
    # Compact JSON (no spaces after separators) plus the salt is what gets signed.
    signed_text = json.dumps(query, separators=(',', ':')) + salt
    digest = hashlib.md5(signed_text.encode()).hexdigest()
    query["_sn"] = digest[2:12]
    return query


def down(page, pid, timeout=10):
    """
    Fetch one page of hotel comments and return its HTML fragment.

    :param page: page number
    :param pid: hotel id
    :param timeout: seconds to wait for the server before giving up
    :return: the value of the ``"html"`` key in the server's JSON response
    :raises requests.RequestException: on timeout, connection failure or an
        HTTP error status
    """
    url = "https://www.mafengwo.cn/hotel/info/comment_list"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41",
    }
    params = get_params(page, pid)
    # Without a timeout requests waits forever on an unresponsive server.
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    # Surface HTTP errors directly instead of failing later while decoding
    # an error page as JSON.
    response.raise_for_status()
    return response.json()["html"]


def func(res):
    """
    Parse the comment HTML and store every comment in the database.

    Each ``div.comm-item`` element yields one row that is passed to
    ``mysql()``. The user id is taken from the ``u=...&`` part of the avatar
    link; it is stored as an empty string when the link is missing or is the
    placeholder ``javascript:;``.

    :param res: HTML text of the comment list
    :return: None
    """
    # Nothing to parse (e.g. a page with no comments).
    if not res:
        return
    tree = etree.HTML(res)
    if tree is None:
        return

    for div in tree.xpath('//div[@class="comm-item _j_comment_item"]'):
        user_name = ''.join(div.xpath('div[@class="user"]/a[@class="name"]/text()'))
        level = ''.join(div.xpath('div[@class="user"]/a[@class="LV"]/text()'))

        # Evaluate the avatar link once and tolerate its absence instead of
        # indexing an empty result list.
        avatar_hrefs = div.xpath('div[@class="user"]/a[@class="avatar"]/@href')
        avatar_href = avatar_hrefs[0] if avatar_hrefs else 'javascript:;'
        if avatar_href == 'javascript:;':
            user_id = ""
        else:
            user_id = ''.join(re.findall("u=(.*?)&", avatar_href))

        # Strip the U+1F31F star emoji from the comment text.
        detail = ''.join(div.xpath('div[@class="txt"]/text()')).replace('\U0001f31f', '')
        create_time = ''.join(div.xpath("div[@class='comm-meta']/span[@class='time']/text()"))
        mysql(user_id, user_name, level, detail, create_time)


if __name__ == '__main__':
    # Download page 1 of the comments for hotel 7091472 and store them.
    func(down(1, 7091472))

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

f4ck3sdn

初心:希望为更多人知识解惑

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值