网站页面如下:
按 F12 打开浏览器开发者工具,查看页面源码:
图片地址和文章的标题都比较容易定位,实现代码如下:
import requests
from bs4 import BeautifulSoup
import os
def get_html(url, timeout=10):
    """Fetch *url* with a browser-like User-Agent and return the body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` waits indefinitely on a stalled server.

    Returns:
        The response body as a string (``Response.text``).

    Raises:
        requests.exceptions.RequestException: On connection errors or
            when the timeout expires.
    """
    # Send a desktop-browser User-Agent string with the request.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
def all_page(first_page=1, last_page=3, page_size=10):
    """Build the list of blog listing URLs to crawl.

    Args:
        first_page: First page number to include (inclusive).
        last_page: Last page number to include (inclusive).
        page_size: Value placed in the ``size`` query parameter.

    Returns:
        A list of URL strings, one per page, in ascending page order.
        With the defaults this is pages 1 through 3 with ``size=10``.
        The list is empty when ``last_page`` is smaller than ``first_page``.
    """
    base_url = 'https://www.hansight.com/blog?page='
    size_suffix = '&size=' + str(page_size)
    return [
        base_url + str(page) + size_suffix
        for page in range(first_page, last_page + 1)
    ]
def htmp_prase():
for url in all_page():
soup = BeautifulSoup(get_html(url),&