10月29日学习总结
一、利用Python压缩和解压缩
数据压缩
import gzip
import requests
# Download the page once; the same text is written both as plain HTML and
# as a gzip-compressed copy.
resp = requests.get('http://www.sohu.com')
page_text = resp.text
with open('sohu_index.html', 'w', encoding='utf-8') as html_file:
    html_file.write(page_text)
with open('sohu_index.html.gz', 'wb') as gz_file:
    # gzip.compress takes the bytes to compress as its first argument; the
    # second argument is the compression level (9 = highest compression).
    zipped_data = gzip.compress(page_text.encode('utf-8'), 9)
    gz_file.write(zipped_data)
解压缩数据
import gzip
# Read sohu_index.html.gz, decompress it and print the decoded text.
with open('sohu_index.html.gz', 'rb') as gz_file:
    raw_bytes = gz_file.read()
unzipped_data = gzip.decompress(raw_bytes)
print(unzipped_data.decode('utf-8'))
二、给数据生成签名(signature)/指纹(fingerprint)/摘要(digest)
MD5 / SHA-1 / SHA-256 ---> 签名算法(哈希函数)
特点:不同的对象几乎不可能产生相同的签名(每个对象的摘要都是独一无二的)
对象 ---> MD5 ---> 128bit ---> 32个十六进制的字符
对象 ---> SHA256 ---> 256bit ---> 64个十六进制字符
hashlib ---> md5 / sha1 / sha256 ---> hexdigest()
"""
import hashlib
# Feed the file to MD5 in 512-byte chunks so it never has to be loaded into
# memory in one piece.
hasher = hashlib.md5()
with open('python-3.9.6-amd64.exe', 'rb') as file:
    # iter() with a sentinel keeps calling file.read(512) until it returns b''.
    for data in iter(lambda: file.read(512), b''):
        hasher.update(data)
print(hasher.hexdigest())
给豆瓣top250电影数据生成签名和利用多线程提升代码运行速度
import hashlib
import time
import random
from functools import wraps
from concurrent.futures import ThreadPoolExecutor
import bs4
import openpyxl
import requests
# 加延迟装饰器避免爬取数据太快导致IP被封
def random_delay(*, min_delay, max_delay):
    """Build a decorator that sleeps for a random time before each call.

    :param min_delay: lower bound of the pause, in seconds
    :param max_delay: upper bound of the pause, in seconds
    :return: a decorator; the decorated function's result is passed through
        unchanged
    """
    def decorate(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Pause first, then delegate to the wrapped function.
            time.sleep(random.uniform(min_delay, max_delay))
            return func(*args, **kwargs)
        return wrapper
    return decorate
# fetch_movie_detail = random_delay(min_delay=0.5, max_delay=6.5)(fetch_movie_detail)
@random_delay(min_delay=0.5, max_delay=6.5)
def fetch_movie_detail(session, url):
    """Fetch genre, country, language and runtime from a movie detail page.

    :param session: session object used to issue the GET request
    :param url: URL of the movie detail page
    :return: 4-tuple ``(genre, country, language, runtime)``. All four fields
        are empty strings when the response status is not 200, and an
        individual field is an empty string when its element is missing
        from the page, so callers can always unpack the result.
    """
    resp = session.get(url=url)
    print(resp.status_code)
    if resp.status_code != 200:
        # Keep the documented 4-tuple shape instead of implicitly returning
        # None, which would make the caller's tuple unpacking fail.
        return '', '', '', ''

    soup = bs4.BeautifulSoup(resp.text, 'html.parser')
    genre_spans = soup.select('#info > span[property="v:genre"]')
    genre = '/ '.join(span.text for span in genre_spans)

    # The country and language values are the text nodes that follow the
    # first and second <span> siblings after the last genre span.
    country = language = ''
    if genre_spans:
        country_span = genre_spans[-1].find_next_sibling('span')
        if country_span is not None:
            country = str(country_span.next_sibling).strip()
            language_span = country_span.find_next_sibling('span')
            if language_span is not None:
                language = str(language_span.next_sibling).strip()

    runtime_span = soup.select_one('#info > span[property="v:runtime"]')
    runtime = runtime_span.text if runtime_span is not None else ''
    return genre, country, language, runtime
def initialize_session(session):
    """Set browser-like request headers and a local SOCKS5 proxy on *session*.

    :param session: session object whose ``headers`` and ``proxies``
        attributes are overwritten
    """
    browser_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36',
    }
    # Both schemes are routed through the same proxy endpoint.
    proxy_url = 'socks5://127.0.0.1:1086'
    session.headers = browser_headers
    session.proxies = dict.fromkeys(('http', 'https'), proxy_url)
def generate_signature(title, rating, runtime):
    """Return the MD5 hex digest of the selected fields.

    The three strings are encoded and hashed back to back, in the order
    title, rating, runtime, with no separator between them.

    :param title: movie title
    :param rating: movie rating, as a string
    :param runtime: movie runtime, as a string
    :return: 32-character hexadecimal MD5 digest
    """
    payload = b''.join(field.encode() for field in (title, rating, runtime))
    return hashlib.md5(payload).hexdigest()
def main():
    """Crawl the ten Douban Top 250 list pages and save the rows to an Excel file."""
    # One shared Session carries the headers and proxy settings for all requests.
    session = requests.Session()
    initialize_session(session)

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = 'Top250'
    header_row = ('标题', '评分', '名言', '类型', '制片国', '语言', '时长', '指纹')
    sheet.append(header_row)

    with ThreadPoolExecutor(max_workers=32) as pool:
        for page in range(10):
            # Each list page shows 25 movies, selected through the start offset.
            resp = session.get(f'https://movie.douban.com/top250?start={page * 25}')
            print(resp.status_code)
            if resp.status_code == 200:
                soup = bs4.BeautifulSoup(resp.text, 'html.parser')
                # Every div.info holds one movie; hand each to a worker thread.
                for div in soup.select('div.info'):  # type: bs4.Tag
                    pool.submit(parse_page, div, session, sheet)
            # Pause between list pages.
            time.sleep(5)
    # Leaving the with-block waits for all submitted tasks before saving.
    workbook.save('豆瓣电影数据.xlsx')
def parse_page(div, session, sheet):
    """Extract one movie from a list-page entry and append it to the sheet.

    :param div: the ``div.info`` element of one movie on a list page
    :param session: session passed on to ``fetch_movie_detail``
    :param sheet: worksheet that receives the finished row
    """
    detail_url = div.select_one('div.hd > a').attrs['href']
    title = div.select_one('div.hd > a > span.title').text
    rating = div.select_one('div.bd > div > span.rating_num').text

    # Some entries have no quote; use a placeholder for them.
    motto_span = div.select_one('div.bd > p.quote > span')
    if motto_span:
        motto = motto_span.text
    else:
        motto = '~~~~~'

    # To bypass the delay decorator, call fetch_movie_detail.__wrapped__(...).
    detail = fetch_movie_detail(session, detail_url)
    genre, country, language, runtime = detail
    signature = generate_signature(title, rating, runtime)
    row = (title, rating, motto, genre, country, language, runtime, signature)
    sheet.append(row)


if __name__ == '__main__':
    main()
三、多线程和多进程的比较
import concurrent.futures
import time
# Numbers that main() passes to is_prime; the 13-entry list is repeated
# five times, giving 65 values in total.
PRIMES = [
1116281,
1297337,
104395303,
472882027,
533000389,
817504243,
982451653,
112272535095293,
112582705942171,
112272535095293,
115280095190773,
115797848077099,
1099726899285419
] * 5
def is_prime(n):
    """Return whether *n* is a prime number.

    :param n: integer to test
    :return: ``True`` if *n* is prime, otherwise ``False``; every value
        below 2 (0, 1 and negative numbers) is reported as not prime
    """
    # Primes start at 2. Without this guard 0 was reported as prime and a
    # negative n raised TypeError, because n ** 0.5 is complex for n < 0.
    if n < 2:
        return False
    # A composite number always has a divisor no larger than its square root.
    for i in range(2, int(n ** 0.5) + 1):
        if n % i == 0:
            return False
    return True
def main():
    """Run is_prime over PRIMES on a process pool and print the elapsed time."""
    start = time.time()
    with concurrent.futures.ProcessPoolExecutor(max_workers=16) as pool:
        # pool.map yields results in input order, so they pair up with PRIMES.
        verdicts = pool.map(is_prime, PRIMES)
        for number, prime in zip(PRIMES, verdicts):
            print('%d is prime: %s' % (number, prime))
    end = time.time()
    print(f'总时间为{end-start}秒')


if __name__ == '__main__':
    main()
并发 / 并行性 —> 多任务在“同时”进行
~ 多线程 —> GIL —> 无法使用多核特性
~多进程 —>启动多个Python解释器来执行一个代码—>可以利用CPU的多核特性
—> 一般写爬虫不使用多进程,因为爬虫是I/O密集型任务
—>对于计算密集型任务,我们一般才会选择使用多进程
~异步编程(异步I/O)—> I/O密集型任务
—>同步(synchronous)—>按照一定的顺序排队执行—>数据同步
—>异步(asynchronous)—>没有一定的执行顺序(无序)
—>阻塞—>一件事没有做完必须等待,直到完成才能做别的事情
—>非阻塞—>一件事情没有完成的时候可以去做别的事情,通常事情完成后会收到通知,需要执行对应的处理。
四、什么叫迭代器
"""
example06 - 什么叫迭代器 ---> 实现了迭代器协议的对象
迭代器协议 ---> 两个魔术方法
~ __iter__ ---> 返回迭代器对象
~ __next__ ---> 从迭代器获取下一个迭代值
1 1 2 3 5 8 13 21 34 55...
"""
class FibIter:
    """Iterator over the first ``max_count`` Fibonacci numbers (1, 1, 2, 3, ...)."""

    def __init__(self, max_count):
        """Start the sequence and remember how many values to produce.

        :param max_count: number of values the iterator yields in total
        """
        self.a = 0
        self.b = 1
        self.cur_count = 0
        self.max_count = max_count

    def __iter__(self):
        """Return the iterator itself, as the iterator protocol requires."""
        return self

    def __next__(self):
        """Advance the sequence and return the next Fibonacci number.

        :raises StopIteration: once ``max_count`` values have been produced
        """
        if self.cur_count >= self.max_count:
            raise StopIteration()
        self.a, self.b = self.b, self.a + self.b
        self.cur_count += 1
        return self.a
obj = FibIter(20)
# Pull the first three values by hand with next().
for _ in range(3):
    print(next(obj))
print('-' * 13)
# The for loop continues from where the manual next() calls stopped.
for value in obj:
    print(value)
五、什么叫生成器
"""
example07 - 什么叫生成器 ---> 迭代器的语法升级简化版本
"""
def fib(max_count):
    """Generate the first *max_count* Fibonacci numbers (1, 1, 2, 3, 5, ...).

    :param max_count: how many values to yield
    """
    prev, curr = 0, 1
    for _ in range(max_count):
        yield curr
        prev, curr = curr, prev + curr
# Calling the function does not return a value; it creates a generator object.
obj = fib(20)
# Take the first five values by hand with next().
for _ in range(5):
    print(next(obj))
print('-' * 13)
# The for loop consumes whatever the generator has left.
for value in obj:
    print(value)
六、协程
"""
example08 - 协程 ---> co-routine ---> 相互协作的子程序
生成器经过预激活就可以成为协程(跟其他子程序进行协作 ---> 协作式并发)
"""
def calc_average():
    """Coroutine that keeps a running average of the values sent to it.

    The first ``send(None)`` / ``next()`` yields ``None``; every later
    ``send(value)`` yields the average of all values received so far.
    """
    running_total = 0
    received = 0
    average = None
    while True:
        # Hand back the latest average and wait for the next value.
        value = yield average
        running_total += value
        received += 1
        average = running_total / received
def main():
    """Read five numbers from standard input and print the running average after each."""
    obj = calc_average()
    # Prime the generator so it is paused at its first yield and can accept send().
    next(obj)
    for _ in range(5):
        number = float(input())
        print(obj.send(number))


if __name__ == '__main__':
    main()