import re

import requests

# Proxies for requests: keys MUST be lowercase scheme names ("http"/"https").
# requests matches the request URL's scheme (always lowercase) against these
# keys, so the original uppercase "HTTP"/"HTTPS" keys were silently ignored
# and every request went out directly. Values carry an explicit scheme as
# recommended by the requests documentation.
proxy = {
    "http": "http://113.3.152.88:8118",
    "https": "http://219.234.5.128:3128",
}

# Pretend to be a desktop Chrome browser so the site serves the normal page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

# Shanbay word-list URL; the page number is appended at request time,
# e.g. base_url + "1" -> https://www.shanbay.com/wordlist/110521/232414/?page=1
base_url = "https://www.shanbay.com/wordlist/110521/232414/?page="
def get_word(page=None):
    """Scrape (word, definition) pairs from one page of the Shanbay word list.

    Args:
        page: page number as a string. When None (the default, matching the
            original behavior), the user is prompted interactively — the new
            parameter makes the function drivable without stdin.

    Returns:
        A tuple ``(pairs, page)`` where ``pairs`` is a list of
        ``(word, definition)`` string tuples found on the page.

    Raises:
        requests.HTTPError: if the server answers with an error status,
            instead of silently regex-matching an error page as before.
    """
    if page is None:
        page = input("请输入获取第几页:")
    url = base_url + page
    response = requests.get(url, headers=headers, proxies=proxy)
    response.raise_for_status()  # fail fast on 4xx/5xx instead of parsing garbage
    html = response.content.decode("utf-8")
    # Pattern: word inside <strong>...</strong>, definition in the following
    # <td class="span10">...</td>; [\w\W]*? spans the markup in between
    # (including newlines, which '.' would not match without re.DOTALL).
    pattern = re.compile(r'<strong>(.*?)</strong>[\w\W]*?"span10">(.*?)</td>')
    return pattern.findall(html), page
def write_word(ret, page):
    """Dump scraped word/definition pairs to a per-page UTF-8 text file.

    Each pair is echoed to stdout and written as ``word<TAB>definition``
    on its own line; a completion message with the pair count is printed
    at the end.

    Args:
        ret: iterable of (word, definition) tuples as produced by get_word.
        page: page number (string) used to name the output file.
    """
    file_name = "扇贝单词第%s页.txt" % page
    with open(file_name, "w", encoding="utf-8") as fp:
        for entry in ret:
            line = entry[0] + "\t" + entry[-1]
            print(line)
            fp.write(line + "\n")
    print(str(len(ret)) + "个单词写完")
if __name__ == "__main__":
    # Interactive entry point: scrape one page, then persist it to disk.
    words, page_no = get_word()
    write_word(words, page_no)
# If anything here is unclear, see my previous blog post.
# Two simple requests scrapers, following directly from my last post:
# https://blog.csdn.net/weixin_44185953/article/details/85722993