说明:记录一下自己爬虫的学习过程 嘻嘻……
前提:
安装 requests 模块(re 模块和 time 模块属于 Python 标准库,无需另外安装)
代码如下:
import requests
import re
import time
def get_url(url, text='src'):
    """Request ``url`` with a desktop-browser User-Agent and return its body.

    Args:
        url: Address to fetch.
        text: Response mode. ``'src'`` returns the decoded page source;
            any other value returns the raw response bytes.

    Returns:
        ``str`` when ``text == 'src'``, otherwise ``bytes``.
    """
    user_agent = (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 '
        'Safari/537.36'
    )
    response = requests.get(url, headers={'User-Agent': user_agent}, timeout=5)
    if text != 'src':
        # Binary mode (e.g. image downloads): hand back the undecoded payload.
        return response.content
    # Use the encoding detected from the body itself before decoding to text.
    response.encoding = response.apparent_encoding
    return response.text
def get_str_list(start_text, end_text, text):
    """Return the part of ``text`` between ``start_text`` and ``end_text``.

    The end marker is searched for only after the start marker, so an
    earlier occurrence of ``end_text`` cannot produce an empty or inverted
    slice.

    Args:
        start_text: Marker that precedes the wanted fragment.
        end_text: Marker that follows the wanted fragment.
        text: String to search.

    Returns:
        The substring strictly between the two markers, or ``''`` when
        either marker cannot be found.
    """
    start = text.find(start_text)
    if start == -1:
        # Without this guard, -1 + len(start_text) would yield a bogus offset.
        return ''
    start += len(start_text)
    end = text.find(end_text, start)
    if end == -1:
        # A -1 end index would silently slice off only the last character.
        return ''
    return text[start:end]
def data_cleaning(src_list, regular, condition):
    """Extract regex matches from a string and keep those containing a marker.

    Args:
        src_list: Text to search (a string, despite the parameter name).
        regular: Regular-expression pattern passed to ``re.compile``; the
            callers in this file use patterns with a single capture group,
            so each match is a string.
        condition: Substring a match must contain to be kept.

    Returns:
        List of matched strings that contain ``condition``, in match order.
    """
    matches = re.compile(regular).findall(src_list)
    # Membership test rather than ``find(...) > 0`` so that a match which
    # starts with ``condition`` (index 0) is not discarded.
    return [match for match in matches if condition in match]
def save_img(img_text):
    """Write image bytes to a ``.jpg`` file in the current directory.

    The file name is the current ``time.time()`` value followed by
    ``.jpg``. A confirmation line is printed after the write.

    Args:
        img_text: Raw bytes to write.
    """
    stamp = time.time()
    filename = f'{stamp}.jpg'
    with open(filename, 'wb') as out_file:
        out_file.write(img_text)
        print("save success")
def format_img(text):
    """Download the images reachable from one listing page.

    Steps:
      1. Cut the listing section out of ``text`` (between the ``list`` and
         ``page`` div markers) and collect the hrefs containing ``desk``.
      2. Fetch each of those detail pages from ``http://www.netbian.com``.
      3. On each detail page, cut the section between the ``pic`` and
         ``pic-down`` div markers, collect ``img src`` values containing
         ``jpg``, download each one and save it with ``save_img``.

    Args:
        text: HTML source of a listing page.
    """
    listing_html = get_str_list('<div class="list">', '<div class="page">', text)
    detail_paths = data_cleaning(listing_html, 'href="(.*?)"', 'desk')
    for detail_path in detail_paths:
        detail_html = get_url(f'http://www.netbian.com{detail_path}', 'src')
        picture_html = get_str_list('<div class="pic">', '<div class="pic-down">', detail_html)
        for image_url in data_cleaning(picture_html, 'img src="(.*?)"', 'jpg'):
            # Any mode other than 'src' makes get_url return raw bytes.
            save_img(get_url(image_url, 'img'))
def main():
    """Crawl listing pages 1 through 9 and download the images on each.

    Page 1 is ``index.htm``; later pages are ``index_<n>.htm``.
    """
    first_page_url = 'http://www.netbian.com/meinv/index.htm'
    for page in range(1, 10):
        page_url = (
            first_page_url
            if page == 1
            else f'http://www.netbian.com/meinv/index_{page}.htm'
        )
        format_img(get_url(page_url))
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()