1、requests模块
import requests
from fake_useragent import UserAgent
# Download one image with a streamed request so the body is written to disk
# chunk by chunk instead of being held entirely in memory; this suits large
# files.
headers = {"Referer": "http://www.angelimg.com"}
url = "http://image.angelimg.com/00000mx00000/9iM1QyCDV5QJQ5xz2cMC307208/u5utBb7buzfQnHaoSP3u307208-juEeGN.jpg"
# stream defaults to False, in which case get()/post() download the whole body
# into memory first. stream=True defers the body until it is iterated.
# The timeout stops a stalled server from hanging the script forever, and the
# `with` block releases the connection once the download finishes or fails.
with requests.get(url, headers=headers, stream=True, timeout=10) as response:
    # raise_for_status() raises on an error response and returns None otherwise.
    response.raise_for_status()
    with open("a.jpg", "wb") as f:
        # chunk_size is in bytes: 1024 bytes is 1 KiB per chunk.
        for chunk in response.iter_content(chunk_size=1024):
            # No per-chunk flush: closing the file flushes everything once.
            f.write(chunk)
注意:以上方式适合下载大文件。
【requests常见异常】
requests.exceptions.SSLError
requests.exceptions.ProxyError
requests.exceptions.ConnectTimeout
requests.exceptions.Timeout
requests.exceptions.HTTPError
requests.exceptions.ConnectTimeout
requests.exceptions.ReadTimeout
2、编码
【1】例子:把“\u5408\u5ddd”转成汉字
# Turn literal "\uXXXX" escape text (e.g. "\u5408\u5ddd") into the characters
# it names.
# Encoding as ASCII with backslashreplace first rewrites any real non-ASCII
# character as an escape. unicode_escape reads its input bytes as Latin-1, so
# without this step text that is already decoded would come out garbled.
city_domain = city_info[0].encode("ascii", "backslashreplace").decode("unicode_escape")
city_name = city_info[1].encode("ascii", "backslashreplace").decode("unicode_escape")
【2】把字体加密的字符取出为以&#x开头的形式
# Glyph names from the font's glyph order, skipping the first two entries.
code_list = font.getGlyphOrder()[2:]
# Swap each 'uni' marker for a backslash-u so the name reads as escape text
# (presumably names look like 'uniE137' -> '\uE137'; confirm against the font).
new_list = list(map(lambda glyph_name: glyph_name.replace('uni', '\\u'), code_list))
print('替换之后', new_list)
# Join the escape fragments and decode them into the actual characters.
text = ''.join(new_list).encode('utf-8').decode('unicode_escape')
注意:这种加密的字体尽量采用正则表达式提取
# Open the film page and wait a fixed 3 seconds before reading its source.
self.browser.get("https://maoyan.com/films/1212608")
time.sleep(3)
rendered_source = self.browser.page_source
html = Selector(rendered_source)
# The query result is a parsel SelectorList, which behaves like a list.
box_office_xpath = "//p[text()='累计票房']/following-sibling::div[1]/span[@class='stonefont']"
box_office = html.xpath(box_office_xpath)
# The first match is a Selector; its root attribute is the underlying
# HtmlElement. etree.tostring() returns bytes, hence the decode().
first_span = box_office[0]
piaofang = etree.tostring(first_span.root).decode()
# Prints the element's markup, e.g. <span class="stonefont"></span>
print(piaofang)
【例子】
# Example: decode a string of literal \uXXXX escapes into readable text.
html_str = r'\u8fd8\u6ca1\u6709\u4eba\u8bc4\u8bba\uff0c\u8d76\u5feb\u62a2\u4e2a\u6c99\u53d1'
# '\\/' is a backslash followed by a slash, written with a valid escape.
# The bare form '\/' is an invalid escape sequence that newer Pythons warn about.
# The replace un-escapes JSON-style "\/" before the escapes are decoded.
decoded = html_str.replace('\\/', '/').encode().decode('unicode-escape')
print(decoded)  # prints: 还没有人评论,赶快抢个沙发