小白一枚,用了大半夜时间结合教程写了个爬虫
爬取网站:女神吧
大佬请指教
import requests
import re
import urllib.request
def get_img(url):
    """Download every ``BDE_Image`` JPEG referenced by the page at *url*.

    The page is fetched with a browser-like User-Agent and scanned with a
    regular expression for ``<img class="BDE_Image" src="....jpg"`` tags.
    Each matching image is saved in the current working directory under the
    last path segment of its URL.

    Args:
        url: Address of the page to scan for images.

    Returns:
        None. Image files are written as a side effect.

    Raises:
        requests.RequestException: If the page itself cannot be fetched.
            Failures on individual images are reported and skipped.
        OSError: If an image file cannot be written.
    """
    headers = {'user-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 Edg/83.0.478.58'}
    # Without a timeout a stalled connection would block the crawl forever.
    timeout = 10

    # The headers were previously built but never sent with the request.
    page = requests.get(url, headers=headers, timeout=timeout)
    pattern = r'<img class="BDE_Image" src="([^"]*\.jpg)'

    # dict.fromkeys drops repeated URLs while keeping page order, so the
    # same file is not downloaded (and overwritten) more than once.
    for img_url in dict.fromkeys(re.findall(pattern, page.text)):
        filename = img_url.split('/')[-1]
        try:
            image = requests.get(img_url, headers=headers, timeout=timeout)
            image.raise_for_status()
        except requests.RequestException as exc:
            # One broken image should not abort the rest of the page.
            print('skipped {}: {}'.format(img_url, exc))
            continue
        with open(filename, 'wb') as out:
            out.write(image.content)
def find_pageurl(url):
    """Collect absolute page URLs linked from the listing page at *url*.

    Args:
        url: Address of the listing page to scan.

    Returns:
        A list of strings: every ``href`` captured from
        ``<a rel="noreferrer" href="...">`` anchors, minus the first nine
        matches, each prefixed with ``https://tieba.baidu.com``.

    Raises:
        requests.RequestException: If the listing page cannot be fetched
            (including when the request times out).
    """
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, timeout=10)
    pattern = '<a rel="noreferrer" href="([^"]*)'
    hrefs = re.findall(pattern, response.text)
    # NOTE(review): the first nine matches are discarded, presumably because
    # they are not thread links on this listing page -- confirm against the
    # live markup.
    return ['https://tieba.baidu.com' + href for href in hrefs[9:]]
def main():
    """Crawl the hard-coded listing page and download images from each link.

    Every page URL returned by ``find_pageurl`` is handed to ``get_img`` in
    order.
    """
    board_url = 'https://tieba.baidu.com/f?kw=%E5%A5%B3%E7%A5%9E&ie=utf-8'
    for page_url in find_pageurl(board_url):
        get_img(page_url)
# Start the crawl only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()