Python Web Scraping
1. Basic crawler skeleton
Use requests to fetch a page, then inspect the response object's status, encoding, headers, and body.
import requests

# Fetch a page and inspect the response object
url = "https://cn.bing.com/search?q=%E7%99%BE%E5%BA%A6&form=ANSPH1&refig=6636054266074ec18daef89910da4de7&pc=CNNDDB"
resp = requests.get(url)
print(resp.status_code)  # HTTP status code, e.g. 200 on success
print(resp.encoding)     # text encoding guessed from the response headers
print(resp.headers)      # response headers
print(resp.text)         # page body decoded as text
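Sites that block bare requests (the "anti-scraping" cases the later sections report) can often be satisfied by sending a browser-like User-Agent, plus a timeout so a hung connection fails fast. A minimal sketch, not part of the original scripts; the header string and target URL are only illustrative examples:

import requests

# Assumption: the site only checks for a browser-like User-Agent.
# The header value below is an illustrative example, not a requirement
# of any particular site.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
resp = requests.get("https://example.com", headers=headers, timeout=10)
resp.raise_for_status()  # raise an exception instead of silently continuing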
2. Scraping a novel
Fetch a novel's chapter-list page, follow each chapter link, and save every chapter's text to its own file.
import requests
from bs4 import BeautifulSoup

def get_chapter_content(url, encoding):
    # Download one chapter page and extract its body text
    r = requests.get(url)
    r.encoding = encoding
    soup = BeautifulSoup(r.text, "html.parser")
    return soup.find("main", id="content").get_text()

def main(url, out_path, encoding):
    resp = requests.get(url)
    if resp.status_code == 200:
        print("Chapter list fetched successfully")
        soup = BeautifulSoup(resp.text, "html.parser")
        nodes = soup.find_all("li")
        count = 0
        for node in nodes:
            # Chapter links on this site carry this exact inline style
            link = node.find("a", style="padding:8px 0;display:flex;")
            if link is None:
                continue
            count += 1
            # link["href"] is site-absolute, so append it to the site root
            new_url = url.replace("/chapterlist/22087153000372102", "") + link["href"]
            title = link.get_text()
            print(f"Scraping chapter {title}")
            file_name = f"{count}_{title.replace(' ', '')}.txt"
            with open(out_path + "\\" + file_name, "w", encoding=encoding) as f:
                f.write(get_chapter_content(new_url, encoding))
    else:
        print("The page blocked the request (anti-scraping)!")

if __name__ == '__main__':
    url = "https://www.xxsy.net/chapterlist/22087153000372102"
    out_path = r"E:\Destbook\笔记\技能\爬虫\案例\novels"
    encoding = "utf-8"
    main(url, out_path, encoding)
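Looping over dozens of chapter pages fires requests back to back; pausing between them is polite and lowers the odds of being blocked. A minimal sketch under that assumption; polite_get is a hypothetical helper, and the 1-second default is an arbitrary example value:

import time
import requests

def polite_get(url, delay=1.0):
    # Hypothetical helper: sleep before each request so a chapter loop
    # does not hammer the server; the 1-second default is an example value.
    time.sleep(delay)
    return requests.get(url)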
3. Scraping gallery images
Walk a paginated image gallery and download every picture stored under its /uploads/ path.
import os
import requests
from bs4 import BeautifulSoup

def GetCraw(url):
    # Fetch one listing page and return all of its <img> tags
    ts = requests.get(url)
    if ts.status_code != 200:
        print(f"Request to {url} was blocked (anti-scraping)!")
        return None
    ts.encoding = "gbk"  # this site serves GBK-encoded pages
    con = BeautifulSoup(ts.text, "html.parser")
    return con.find_all("img")

def DownImage(images, out_path, base_url):
    for image in images:
        # Only the pictures under /uploads/ are wanted; skip logos, icons, etc.
        if "uploads" not in image["src"]:
            continue
        image_name = os.path.basename(image["src"])
        src = base_url + image["src"]  # image["src"] is site-absolute
        with open(out_path + "\\" + image_name, "wb") as f:
            resp_image = requests.get(src)
            f.write(resp_image.content)

def SinglePage(url, out_path, base_url):
    # Scrape one listing page end to end
    images = GetCraw(url)
    if images:
        DownImage(images, out_path, base_url)

def ManyPage(url, out_path, base_url):
    # Page 1 is the bare listing URL; pages 2-10 follow the index_N.html pattern
    ManySrc = [url] + [url + f"index_{i}.html" for i in range(2, 11)]
    for i, page_url in enumerate(ManySrc):
        SinglePage(page_url, out_path, base_url)
        print(f"Page {i + 1} finished!")

if __name__ == '__main__':
    url = r"https://pic.netbian.com/4kmeinv/"
    base_url = "https://pic.netbian.com"  # site root for resolving image paths
    out_path = r"E:\Destbook\笔记\技能\爬虫\案例\BeautalfulGirl"
    ManyPage(url, out_path, base_url)
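Deriving the site root by stripping "/4kmeinv/" from the listing URL breaks as soon as the listing path changes; the standard library can resolve a site-absolute src against any page URL instead. A minimal alternative sketch using urllib.parse.urljoin, not part of the original script; the image path below is a hypothetical example:

from urllib.parse import urljoin

# urljoin resolves a site-absolute src against the page URL,
# so no manual string surgery on the listing URL is needed.
page_url = "https://pic.netbian.com/4kmeinv/"
src = urljoin(page_url, "/uploads/example.jpg")  # hypothetical image path
print(src)  # https://pic.netbian.com/uploads/example.jpg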