一、Scraping 王者荣耀 (Honor of Kings) hero information (single page)
import json
import pymysql
import requests
from lxml import etree

def get_heros(url):
    response = requests.get(url)
    # The official site is GBK-encoded while the toolchain defaults to UTF-8,
    # so without this the Chinese text comes out garbled
    response.encoding = 'GBK'
    html_etree = etree.HTML(response.text)
    return html_etree

def extract_heros(html_etree):
    heros_list = html_etree.xpath('//ul[contains(@class,"herolist clearfix")]/li/a')
    base_url = 'https:'
    heros = []
    for hero in heros_list:
        hero_img = base_url + hero.xpath('./img/@src')[0]
        hero_name = hero.xpath('./img/@alt')[0]
        hero_info = {
            'hero_img': hero_img,
            'hero_name': hero_name
        }
        heros.append(hero_info)
    return heros

def save_heros_as_json(heros):
    # json.dumps escapes to ASCII by default and Chinese is outside ASCII;
    # ensure_ascii=False keeps the Chinese readable in the file
    hero_json = json.dumps(heros, ensure_ascii=False)
    with open('hero.json', 'a', encoding='utf-8') as w:
        w.write(hero_json)
        w.flush()

def save_heros_to_db(heros):
    conn = pymysql.Connect(host='localhost', user='root', password='6666', port=3306, database='mydb1')
    cursor = conn.cursor()
    for hero in heros:
        # Parameterized query: let pymysql quote and escape the values
        cursor.execute('insert into hero(heroname, heroimg) values(%s, %s)',
                       (hero.get('hero_name'), hero.get('hero_img')))
    conn.commit()  # pymysql does not autocommit, so the inserts must be committed
    cursor.close()
    conn.close()

if __name__ == '__main__':
    url = 'https://pvp.qq.com/web201605/herolist.shtml'
    html_etree = get_heros(url)
    heros = extract_heros(html_etree)
    # save_heros_as_json(heros)
    save_heros_to_db(heros)
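The INSERT above assumes a hero table already exists in mydb1; the script never creates it. A minimal sketch of a matching schema follows, where only the table and column names come from the script and the types, lengths, and charset are assumptions:

import pymysql

# Assumed schema: the table/column names match the INSERT above;
# everything else here is a guess, not something the original fixes.
conn = pymysql.Connect(host='localhost', user='root', password='6666', port=3306, database='mydb1')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS hero (
            id INT PRIMARY KEY AUTO_INCREMENT,
            heroname VARCHAR(64),
            heroimg VARCHAR(255)
        ) DEFAULT CHARSET=utf8mb4
    """)
conn.commit()
conn.close()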
Notes:
1. The official 王者荣耀 page is GBK-encoded, not the UTF-8 that PyCharm and the rest of the toolchain default to, so set response.encoding = 'GBK' on the response before reading its text; otherwise the Chinese is garbled.
2. json.dumps serializes with ASCII escapes by default, and Chinese characters fall outside ASCII, so the stored JSON file would not show readable Chinese; passing ensure_ascii=False (hero_json = json.dumps(heros, ensure_ascii=False)) keeps the Chinese readable.
3. A few heroes cannot be scraped from the static HTML because they are loaded dynamically; one workaround is sketched just below.
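On point 3: the heroes missing from the static HTML are rendered client-side from a JSON feed. A minimal sketch of fetching that feed directly, where the herolist.json URL and the cname/ename field names are assumptions taken from inspecting the page's network traffic (they may have changed since):

import requests

# Assumed endpoint: the JSON the hero-list page loads via JavaScript.
url = 'https://pvp.qq.com/web201605/js/herolist.json'
response = requests.get(url)
response.encoding = 'GBK'  # same GBK encoding as the rest of the site (assumed)
for item in response.json():
    print(item.get('ename'), item.get('cname'))  # numeric id, Chinese hero name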
二、Scraping 糗事百科 (Qiushibaike) jokes and storing them (multiple pages, no login needed)
import json
import pymysql
import requests
from lxml import etree

def get_jokes(url):
    """Fetch a page and return its parsed lxml element tree."""
    if not url:  # guard against an empty url
        exit()
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    html_etree = etree.HTML(response.text)
    return html_etree

def extract_jokes(html_etree):
    """Extract the author and content of every joke on the page."""
    jokes_lists = html_etree.xpath('//div[contains(@class,"article block")]')  # one element per joke
    jokes = []
    for jokes_list in jokes_lists:
        author = extract_text(jokes_list.xpath('./div/a[contains(@onclick,"_hmt.push")]/h2/text()'))
        content = extract_text(jokes_list.xpath('./a/div[contains(@class,"content")]/span/text()'))
        jokes_dict = {
            'author': author,
            'content': content
        }
        jokes.append(jokes_dict)
    return jokes

def extract_text(text_list):
    """Data cleaning: join the text fragments and strip surrounding whitespace."""
    return "".join(text_list).strip()

def save_jokes_as_json(jokes, page):
    data = {
        'status': 'ok',
        'code': 200,
        'data': jokes
    }
    data_json = json.dumps(data, ensure_ascii=False)
    with open('%d.json' % page, 'a', encoding='utf-8') as w:
        w.write(data_json)
        w.flush()

def save_jokes_to_db(jokes_list):
    # Supply the server's credentials and connect once for the whole batch
    conn = pymysql.Connect(host="localhost", port=3306, user="root", password="6666", database="mydb1")
    cursor = conn.cursor()
    for joke in jokes_list:
        print(joke.get("content"))
        # Parameterized query: let pymysql quote and escape the values
        cursor.execute("insert into joke(author, joke_content) values(%s, %s)",
                       (joke.get("author"), joke.get("content")))
    # pymysql opens a transaction by default, so the inserts must be committed
    conn.commit()
    cursor.close()
    conn.close()

def get_next_url(html_etree):
    next_page_info = html_etree.xpath('//ul[contains(@class,"pagination")]/li/a/span[contains(@class,"next")]')
    if not next_page_info:
        print('Last page')
        return None  # ends the while-url loop in main
    base_url = 'https://www.qiushibaike.com'
    new_url = html_etree.xpath('//ul[contains(@class,"pagination")]/li[last()]/a/@href')[0]
    return base_url + new_url

if __name__ == '__main__':
    url = 'https://www.qiushibaike.com/text/'
    page = 1
    while url:
        print(url)
        html_etree = get_jokes(url)
        jokes = extract_jokes(html_etree)
        save_jokes_as_json(jokes, page)
        # save_jokes_to_db(jokes)
        url = get_next_url(html_etree)
        page += 1
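As in section one, save_jokes_to_db assumes its target table exists. A possible joke schema, with only the column names fixed by the INSERT and everything else assumed:

import pymysql

# Assumed schema for the joke table; types and lengths are guesses.
conn = pymysql.Connect(host='localhost', port=3306, user='root', password='6666', database='mydb1')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS joke (
            id INT PRIMARY KEY AUTO_INCREMENT,
            author VARCHAR(64),
            joke_content TEXT
        ) DEFAULT CHARSET=utf8mb4
    """)
conn.commit()
conn.close()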
三、Scraping 新片厂 (Xinpianchang) video information (login handling, multiple pages)
Problem: the first 20 pages of 新片厂's video listing need no login; past page 20 the site requires a logged-in session.
import json
import pymysql
import requests
from lxml import etree

def get_movies(url):
    # Cookie captured from a logged-in browser session: this is what gets the
    # crawler past the 20-page login wall
    headers = {
        'Cookie': 'Device_ID=5f86ec3183da2; Authorization=BD428C0D1D8684D391D8684D941D868B7F61D8683179D33964B2; _ga=GA1.2.1623768300.1602677809; _gid=GA1.2.2021033802.1602677809; UM_distinctid=1752720b8cb152-0c05a736a091e4-333376b-144000-1752720b8cc65e; PHPSESSID=3hfukfc3er7sqtcm9i0g9fh3s7; SERVER_ID=b52601c8-backend-jeatmlpn; Hm_lvt_dfbb354a7c147964edec94b42797c7ac=1602730066,1602755676,1602812945,1602836392; CNZZDATA1262268826=1592050288-1602675873-%7C1602835419; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22175270ab305301-007370f4b07bc1-333376b-1327104-175270ab3062c6%22%2C%22%24device_id%22%3A%22175270ab305301-007370f4b07bc1-333376b-1327104-175270ab3062c6%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _gat=1; channel_page=apc%3D; Hm_lpvt_dfbb354a7c147964edec94b42797c7ac=1602838892; cn_1262268826_dplus=%7B%22distinct_id%22%3A%20%221752720b8cb152-0c05a736a091e4-333376b-144000-1752720b8cc65e%22%2C%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201602838919%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201602838919%7D'
    }
    response = requests.get(url, headers=headers)
    html_etree = etree.HTML(response.text)
    return html_etree

def extract_movies(html_etree):
    movie_list = html_etree.xpath('//div[contains(@class,"channel-con")]/ul/li')
    movies = []
    for movie in movie_list:
        movie_name = movie.xpath('./div/div[contains(@class,"video-con-top")]/a/p/text()')[0]
        movie_num = movie.xpath('./div/div[contains(@class,"video-con-top")]/div[contains(@class,"video-view")]/span[contains(@class,"fw_300 icon")]/text()')[0]
        # Images are lazy-loaded: before JS rendering the real URL sits in _src
        movie_img = movie.xpath('./a/img/@_src')[0]
        movie_dict = {
            'moviename': movie_name,
            'movienum': movie_num,
            'movie_img': movie_img
        }
        movies.append(movie_dict)
    return movies

def save_movies_to_db(movies):
    # Connect once for the whole batch rather than once per movie
    conn = pymysql.Connect(host='localhost', user='root', password='6666', port=3306, database='mydb1')
    cursor = conn.cursor()
    for movie in movies:
        # Parameterized query: let pymysql quote and escape the values
        cursor.execute('insert into movie(moviename, movienum, movieimg) values(%s, %s, %s)',
                       (movie.get('moviename'), movie.get('movienum'), movie.get('movie_img')))
    conn.commit()  # pymysql does not autocommit, so the inserts must be committed
    cursor.close()
    conn.close()

def save_movies_as_json(movies, page):
    data = {
        'status': 'ok',
        'code': 200,
        'movies': movies
    }
    movie_json = json.dumps(data, ensure_ascii=False)
    with open('%d.json' % page, 'a', encoding='utf-8') as w:
        w.write(movie_json)
        w.flush()

def get_next_url(html_etree):
    flag = html_etree.xpath('//div[contains(@class,"page")]/a[last()]/@title')
    base_url = 'https://www.xinpianchang.com'  # no trailing slash: the href already starts with /
    if not flag:
        print('Last page')
        return None  # ends the while-url loop in main
    next_url = base_url + html_etree.xpath('//div[contains(@class,"page")]/a[last()]/@href')[0]
    return next_url

if __name__ == '__main__':
    url = 'https://www.xinpianchang.com/channel/index/sort-like?from=navigator'
    page = 1
    while url:
        html_etree = get_movies(url)
        movies = extract_movies(html_etree)
        # save_movies_to_db(movies)
        save_movies_as_json(movies, page)
        url = get_next_url(html_etree)
        page += 1
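An alternative to replaying the whole raw Cookie header is handing requests a parsed dict through its cookies parameter. Which entries the login actually requires is an assumption (Authorization and PHPSESSID look like the session cookies in the header above; when in doubt, carry over the full set). A minimal sketch with placeholder values:

import requests

# Placeholder cookie values -- substitute the ones from your own logged-in session.
cookies = {
    'Authorization': '<your Authorization cookie>',
    'PHPSESSID': '<your PHPSESSID cookie>',
}
response = requests.get(
    'https://www.xinpianchang.com/channel/index/sort-like?from=navigator',
    cookies=cookies,
)
print(response.status_code)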
Summary:
To get past the login wall, add the cookie from a logged-in session to the request headers.
Even so, the crawl stopped at page 46: the server's anti-scraping policy (rate-based blocking) kicked in. One common mitigation is sketched below.
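A minimal sketch of that mitigation: pause between requests and back off on failure. The delay values are arbitrary guesses, not limits the site publishes:

import time
import random
import requests

def polite_get(url, headers=None, retries=3):
    """GET with a random pause and a simple retry/back-off.
    The 1-3 s pause is a guess at what the rate limiter tolerates."""
    for attempt in range(retries):
        time.sleep(random.uniform(1, 3))  # pause before every request
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response
        time.sleep(5 * (attempt + 1))  # back off harder after each refusal
    return None

Swapping polite_get in for requests.get inside get_movies slows the crawl down but makes a mid-crawl ban less likely.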