import requests
from lxml import etree
if __name__ == '__main__':
    # Scrape the 4K image listing page and download every thumbnail into ./data/.
    url = "http://pic.netbian.com/4kmeinv/"
    headers = {
        'user-agent': 'Mozilla/5.0(WindowsNT10.0;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/85.0.4183.121Safari/537.36',
    }
    page_resp = requests.get(url=url, headers=headers)
    # The site serves GBK-encoded HTML; requests defaults to ISO-8859-1 when the
    # header gives no charset, which garbles the alt-text. Override before .text.
    page_resp.encoding = 'gbk'
    page_html = page_resp.text
    tree = etree.HTML(page_html)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    # Ensure the output directory exists before the first write.
    import os
    os.makedirs("data", exist_ok=True)
    for li in li_list:
        img_base_link = li.xpath('./a/img/@src')[0]
        img_link = "http://pic.netbian.com" + img_base_link
        img_alt = li.xpath('./a/img/@alt')[0]
        img_data = requests.get(url=img_link, headers=headers).content
        img_name = img_alt + ".jpg"
        # BUG FIX: ".jpg" was previously appended a second time here,
        # yielding filenames like "foo.jpg.jpg". img_name already has it.
        filepath = "data/" + img_name
        # Context manager guarantees the file is closed even if write() fails.
        with open(filepath, 'wb') as fp:
            fp.write(img_data)
        print(img_name + "爬取完毕")
    print("爬取结束")
其中遇到了乱码问题。原因是源网页编码用的是 gbk,而 response 默认使用 iso-8859-1 编码。怎么查看源网页的编码方式?打开源网页的源代码,搜索 charset,
即可看到网页的编码方式。
解决方法一:
将response编码方式改为与网页一样,这里是gbk
page_html = requests.get(url=url, headers=headers)
page_html.encoding = 'gbk'
page_html = page_html.text
解决方法二:
将由response的iso-8859-1编码方式出来的乱码,重新解码成gbk
img_name = img_name.encode('iso-8859-1').decode('gbk')