10月29日学习总结

10月29日学习总结

一、利用Python压缩和解压缩

数据压缩
import gzip

import requests

# Fetch the Sohu homepage, save it as plain HTML, then store a gzip copy.
response = requests.get('http://www.sohu.com')
html_text = response.text
with open('sohu_index.html', 'w', encoding='utf-8') as html_file:
    html_file.write(html_text)
# gzip.compress() takes raw bytes plus a level: 0 is fastest, 9 compresses best.
with open('sohu_index.html.gz', 'wb') as gz_file:
    gz_file.write(gzip.compress(html_text.encode('utf-8'), 9))
解压缩数据
import gzip

# Read the gzip archive back and print the decompressed HTML.
with open('sohu_index.html.gz', 'rb') as gz_file:
    raw_bytes = gz_file.read()
print(gzip.decompress(raw_bytes).decode('utf-8'))

二、给数据生成签名(signature)/指纹(fingerprint)/摘要(digest)

MD5 / SHA-1 / SHA-256 ---> 签名算法(哈希函数)
特点:不同的对象几乎不可能产生相同的签名(每个对象的摘要都是独一无二的)

对象 ---> MD5 ---> 128bit ---> 32个十六进制的字符
对象 ---> SHA256 ---> 256bit ---> 64个十六进制字符

hashlib ---> md5 / sha1 / sha256 ---> hexdigest()
"""
import hashlib

hasher = hashlib.md5()
with open('python-3.9.6-amd64.exe', 'rb') as file:
    # Feed the file to the hasher in 512-byte chunks so a large file
    # never has to be loaded into memory all at once; iter() with a
    # sentinel stops as soon as read() returns empty bytes.
    for chunk in iter(lambda: file.read(512), b''):
        hasher.update(chunk)
print(hasher.hexdigest())
给豆瓣top250电影数据生成签名和利用多线程提升代码运行速度
import hashlib
import time
import random
from functools import wraps
from concurrent.futures import ThreadPoolExecutor

import bs4
import openpyxl
import requests


# 加延迟装饰器避免爬取数据太快导致IP被封
def random_delay(*, min_delay, max_delay):
    """Decorator factory: sleep a random time before every call.

    Spacing out requests with a random pause makes the crawler look less
    like a bot and helps avoid getting the IP banned.

    :param min_delay: lower bound of the pause, in seconds
    :param max_delay: upper bound of the pause, in seconds
    """

    def decorate(func):

        @wraps(func)
        def wrapper(*args, **kwargs):
            # Uniform pause drawn from [min_delay, max_delay).
            pause = min_delay + (max_delay - min_delay) * random.random()
            time.sleep(pause)
            return func(*args, **kwargs)

        return wrapper

    return decorate


# Equivalent manual form: fetch_movie_detail = random_delay(min_delay=0.5, max_delay=6.5)(fetch_movie_detail)
@random_delay(min_delay=0.5, max_delay=6.5)
def fetch_movie_detail(session, url):
    """Fetch a movie's genre, country, language and runtime from its detail page.

    :param session: shared requests session (headers/proxy already configured)
    :param url: URL of the movie detail page
    :return: (genre, country, language, runtime) tuple, or None when the
        response status is not 200
    """
    resp = session.get(url=url)
    print(resp.status_code)
    if resp.status_code != 200:
        return None
    soup = bs4.BeautifulSoup(resp.text, 'html.parser')
    genre_spans = soup.select('#info > span[property="v:genre"]')
    genre = '/ '.join(span.text for span in genre_spans)
    # Country and language are bare text nodes that follow their label
    # <span> elements, so walk the siblings starting from the last genre.
    last_span = genre_spans[-1]  # type: bs4.Tag
    country_label = last_span.find_next_sibling('span')
    country = str(country_label.next_sibling).strip()
    language_label = country_label.find_next_sibling('span')
    language = str(language_label.next_sibling).strip()
    runtime = soup.select_one('#info > span[property="v:runtime"]').text
    return genre, country, language, runtime


def initialize_session(session):
    """Give the session browser-like headers and route it through a local SOCKS5 proxy."""
    session.headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'
    }
    # Same local proxy endpoint for both plain and TLS traffic.
    proxy_url = 'socks5://127.0.0.1:1086'
    session.proxies = {'http': proxy_url, 'https': proxy_url}


def generate_signature(title, rating, runtime):
    """Build an MD5 fingerprint from the fields that identify a movie record.

    Feeding the concatenation to md5 once is equivalent to a sequence of
    update() calls with the individual parts.
    """
    payload = (title + rating + runtime).encode()
    return hashlib.md5(payload).hexdigest()


def main():
    """Crawl the Douban Top250 list pages, fan detail scraping out to a
    thread pool, and save all rows to an Excel workbook."""
    # A Session reuses the TCP connection and carries headers/proxy settings.
    session = requests.Session()
    initialize_session(session)
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = 'Top250'
    sheet.append(('标题', '评分', '名言', '类型', '制片国', '语言', '时长', '指纹'))
    futures = []
    with ThreadPoolExecutor(max_workers=32) as pool:
        for page in range(10):
            # Each list page holds 25 movies: start=0, 25, 50, ...
            resp = session.get(f'https://movie.douban.com/top250?start={page * 25}')
            print(resp.status_code)
            if resp.status_code == 200:
                soup = bs4.BeautifulSoup(resp.text, 'html.parser')
                divs_list = soup.select('div.info')
                for div in divs_list:  # type: bs4.Tag
                    futures.append(pool.submit(parse_page, div, session, sheet))
            # Pause between list pages to stay under the site's rate limit.
            time.sleep(5)
    # The pool context has shut down, so every future is done. Surface any
    # worker errors instead of silently dropping them (submit() swallows
    # exceptions unless the future is inspected).
    for future in futures:
        error = future.exception()
        if error is not None:
            print(f'worker failed: {error!r}')
    # NOTE(review): parse_page appends to the shared sheet from many threads;
    # openpyxl is not documented as thread-safe — confirm or serialize appends.
    workbook.save('豆瓣电影数据.xlsx')


def parse_page(div, session, sheet):
    """Extract one movie's row from a list-page <div class="info"> element,
    fetch the extra fields from its detail page, and append the row."""
    detail_url = div.select_one('div.hd > a').attrs['href']
    title = div.select_one('div.hd > a > span.title').text
    rating = div.select_one('div.bd > div > span.rating_num').text
    quote_span = div.select_one('div.bd > p.quote > span')
    # Not every movie has a quote; fall back to a placeholder.
    motto = quote_span.text if quote_span else '~~~~~'
    # To bypass the delay decorator, call fetch_movie_detail.__wrapped__(...).
    genre, country, language, runtime = fetch_movie_detail(session, detail_url)
    row = (title, rating, motto, genre, country, language, runtime,
           generate_signature(title, rating, runtime))
    sheet.append(row)


# Script entry point.
if __name__ == '__main__':
    main()

请添加图片描述

三、多线程和多进程的比较

import concurrent.futures
import time

# Mix of small and very large candidates, repeated 5 times so the prime
# check is heavy enough to show a speedup from multiple processes.
# (The list deliberately contains duplicates, e.g. 112272535095293.)
PRIMES = [
    1116281,
    1297337,
    104395303,
    472882027,
    533000389,
    817504243,
    982451653,
    112272535095293,
    112582705942171,
    112272535095293,
    115280095190773,
    115797848077099,
    1099726899285419
] * 5


def is_prime(n):
    """Return True if n is a prime number.

    Trial division up to sqrt(n); deliberately CPU-bound so the process
    pool below has real work to parallelize.

    :param n: integer to test
    :return: True when n is prime, False otherwise
    """
    # 0, 1 and negatives are not prime. The original returned True for
    # n <= 0 because the loop body never ran and only `n != 1` was checked.
    if n < 2:
        return False
    for i in range(2, int(n ** 0.5) + 1):
        if n % i == 0:
            return False
    return True


def main():
    """Time a prime check over PRIMES using a pool of worker processes."""
    started_at = time.time()
    with concurrent.futures.ProcessPoolExecutor(max_workers=16) as executor:
        # map() yields results in input order, so zip pairs each number
        # with its own verdict.
        results = executor.map(is_prime, PRIMES)
        for number, prime in zip(PRIMES, results):
            print('%d is prime: %s' % (number, prime))
    finished_at = time.time()
    print(f'总时间为{finished_at - started_at}秒')


# This guard is required: ProcessPoolExecutor re-imports the module in the
# child processes (spawn start method), so main() must not run on import.
if __name__ == '__main__':
    main()

并发 / 并行性 —> 多任务在“同时”进行

​ ~ 多线程 —> GIL —> 无法使用多核特性

​ ~多进程 —>启动多个Python解释器来执行一个代码—>可以利用CPU的多核特性

​ —> 一般写爬虫不适合使用多进程,因为爬虫是I/O密集型任务

​ —>对于计算密集型任务,我们一般才会选择使用多进程

​ ~异步编程(异步I/O)—> I/O密集型任务

​ —>同步(synchronous)—>按照一定的顺序排队执行—>数据同步

​ —>异步(asynchronous)—>没有一定的执行顺序(无序)

​ —>阻塞—>一件事没有做完必须等待,直到完成才能做别的事情

​ —>非阻塞—>一件事情没有完成的时候可以去做别的事情,通常事情完成后会收到通知,需要执行对应的处理。

四、什么叫迭代器

"""
example06   -   什么叫迭代器 ---> 实现了迭代器协议的对象
迭代器协议 ---> 两个魔术方法
~ __iter__  ---> 返回迭代器对象
~ __next__  ---> 从迭代器获取下一个迭代值

1 1 2 3 5 8 13 21 34 55...
"""

class FibIter:
    """Iterator over the first *max_count* Fibonacci numbers: 1 1 2 3 5 8 ...

    Implements the iterator protocol (__iter__/__next__), so instances work
    directly in for-loops and with next().
    """

    def __init__(self, max_count):
        # Consecutive pair of Fibonacci numbers, seeded just before 1, 1.
        self.a, self.b = 0, 1
        self.cur_count = 0          # values produced so far
        self.max_count = max_count  # total values to produce

    def __iter__(self):
        # An iterator is its own iterable.
        return self

    def __next__(self):
        if self.cur_count >= self.max_count:
            # Normal exhaustion signal understood by for-loops.
            raise StopIteration()
        self.cur_count += 1
        self.a, self.b = self.b, self.a + self.b
        return self.a


# Demo: pull three values manually with next(), then let a for-loop drain
# the rest — the iterator remembers its position between the two phases.
obj = FibIter(20)
for _ in range(3):
    print(next(obj))
print('-' * 13)
for value in obj:
    print(value)

请添加图片描述

五、什么叫生成器

"""
example07   -   什么叫生成器 ---> 迭代器的语法升级简化版本
"""


def fib(max_count):
    """Generator yielding the first *max_count* Fibonacci numbers (1 1 2 3 5 ...)."""
    prev, curr = 0, 1
    produced = 0
    while produced < max_count:
        prev, curr = curr, prev + curr
        produced += 1
        yield prev


# Calling fib() does not run its body; it only creates a generator object.
obj = fib(20)
for _ in range(5):
    print(next(obj))
print('-' * 13)
for value in obj:
    print(value)

请添加图片描述

六、协程

"""
example08   -   协程 ---> co-routine ---> 相互协作的子程序

生成器经过预激活就可以成为协程(跟其他子程序进行协作 ---> 协作式并发)
"""


def calc_average():
    """Coroutine that keeps a running average of the values sent into it.

    Prime it with send(None) first; each later send(value) returns the
    average of everything sent so far.
    """
    running_sum, sample_count = 0, 0
    average = None
    while True:
        # Pause here: hand back the current average, resume with a new value.
        received = yield average
        running_sum += received
        sample_count += 1
        average = running_sum / sample_count


def main():
    """Read five numbers from stdin and print the running average after each."""
    averager = calc_average()
    # Prime the coroutine: advance it to the first yield so send() works.
    averager.send(None)
    for _ in range(5):
        value = float(input())
        print(averager.send(value))


# Script entry point.
if __name__ == '__main__':
    main()

请添加图片描述


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值