import os
import re
from urllib.parse import urljoin

import requests
# Crawl every link found on a seed page and save each linked page whose text
# contains the keyword to a numbered .html file.
url = "https://www.sohu.com"
url_list = []
crawl_urls = 0
save_page_num = 0

# hrefs that are not crawlable pages: mail links, script pseudo-links,
# in-page anchors, and static assets.
_SKIPPED_PREFIXES = ("mailto", "javascript", "#")
_SKIPPED_SUFFIXES = ("ico", "png", "css", "jpg", "js")
# Seconds to wait for a server before giving up; without a timeout
# requests.get() can block forever on an unresponsive host.
_REQUEST_TIMEOUT = 10
# Pages containing this keyword are written to _SAVE_DIR as 1.html, 2.html, ...
_KEYWORD = "汽车"
_SAVE_DIR = "e:\\count\\"

r = requests.get(url, timeout=_REQUEST_TIMEOUT)
html = r.text
# Pull the value of every double-quoted href attribute out of the seed page.
urls = re.findall(r'href="(.*?)"', html)

seen = set()
for href in urls:
    href = href.strip()  # drop leading/trailing whitespace around the link
    # An empty href or "/" only points back at the seed page itself.
    if not href or href == "/":
        continue
    if href.startswith(_SKIPPED_PREFIXES) or href.endswith(_SKIPPED_SUFFIXES):
        continue
    # Resolve protocol-relative ("//host/x") and site-relative ("/x", "x.html")
    # links against the seed URL; requests.get() rejects URLs without a scheme.
    link = urljoin(url, href)
    if not link.startswith(("http://", "https://")):
        continue
    # Fetch each distinct page only once, keeping first-seen order.
    if link in seen:
        continue
    seen.add(link)
    url_list.append(link)

for page_url in url_list:
    print(page_url)
    crawl_urls += 1
    try:
        r = requests.get(page_url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # A single dead or slow link must not abort the whole crawl.
        print("skipped %s: %s" % (page_url, exc))
        continue
    if _KEYWORD in r.text:
        save_page_num += 1
        # Create the output directory on first use so open() cannot fail
        # just because it is missing.
        os.makedirs(_SAVE_DIR, exist_ok=True)
        with open(_SAVE_DIR + str(save_page_num) + ".html", "w", encoding="utf-8") as fp:
            fp.write(r.text)

print("一共爬了%s个网页" % crawl_urls)
print("一共保存了%s个网页" % save_page_num)
# Python 爬取特定内容的网页并保存
# 最新推荐文章于 2024-09-28 16:04:32 发布