准备
- python IDLE
- robots协议的使用:在网站域名后追加 /robots.txt 即可查看该站点的爬取规则
- 安装requests库:以管理员身份启动cmd,执行命令 pip install requests
- 安装beautifulsoup4库:以管理员身份启动cmd,执行命令 pip install beautifulsoup4
实例
- 爬取单张图片
import requests
import os
# Sample image to download; getPics uses the last "/"-separated segment of
# this URL as the local file name.
url = "https://i0.hippopx.com/photos/320/918/427/sky-clouds-sunlight-dark-thumb.jpg"
# Destination directory. getPics builds the file path as root + file name,
# so this value has to end with a path separator.
root = "D://pics//"
def getPics(url, root):
    """Download the file at ``url`` into the directory ``root``.

    The local file name is the last ``/``-separated segment of ``url`` and is
    appended directly to ``root``, so ``root`` must end with a path separator.
    An existing file is never overwritten.

    Args:
        url: Address of the image to fetch.
        root: Destination directory; created if it does not exist.

    Returns:
        None. The outcome is reported by printing a status message.
    """
    path = root + url.split('/')[-1]
    try:
        # makedirs also creates missing parent directories and does not fail
        # when the directory is already there.
        os.makedirs(root, exist_ok=True)
        if os.path.exists(path):
            print("文件已存在")
            return
        r = requests.get(url, timeout=30)
        # Without this check an HTTP error page would be saved as the image.
        r.raise_for_status()
        # The with-statement closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(r.content)
        print("文件保存成功")
    except (requests.RequestException, OSError):
        # Network/HTTP failures and file-system errors are reported the same
        # way; anything else (e.g. KeyboardInterrupt) is left to propagate.
        print("爬取失败")
- 爬取大学排名
import requests
import bs4
from bs4 import BeautifulSoup
def getHTMLText(url):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or an empty string when the request fails or
        the server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Decode with the encoding detected from the body instead of the
        # one derived from the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures are turned into ""; unrelated errors
        # (e.g. KeyboardInterrupt) are no longer silently swallowed.
        return ""
def fillUnivList(ulist, html):
    """Parse the ranking table in ``html`` and append its rows to ``ulist``.

    For every ``<tr>`` element directly under the first ``<tbody>``, the
    ``.string`` of its first, second and fourth ``<td>`` cells is appended to
    ``ulist`` as a three-item list (printUnivList prints these as rank,
    school name and total score).

    Args:
        ulist: List that receives the parsed rows; modified in place.
        html: HTML source of the ranking page. May be empty.

    Returns:
        None.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # Happens when html is "" (getHTMLText's failure value) or the page
        # has no table body; leave ulist untouched instead of crashing.
        return
    for tr in tbody.children:
        # .children also yields whitespace text nodes between rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')
        if len(tds) < 4:
            # Row does not have the expected columns; skip it.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])
def printUnivList(ulist, num):
    """Print a header line followed by at most ``num`` rows of ``ulist``.

    Args:
        ulist: Sequence of rows; the first three items of each row are
            printed under the headers rank, school name and total score.
        num: Maximum number of rows to print. If ``ulist`` is shorter, only
            the available rows are printed.

    Returns:
        None.
    """
    row_fmt = "{:^6}\t{:<10}\t{:^6}"
    print(row_fmt.format("排名", "学校名称", "总分"))
    # Slicing (instead of indexing range(num)) tolerates a list shorter than
    # num; max() keeps a negative num meaning "no rows", as range() did.
    for u in ulist[:max(num, 0)]:
        # A missing cell value (None) cannot take an alignment format spec,
        # so print it as an empty field.
        rank, name, score = ("" if field is None else field for field in u[:3])
        print(row_fmt.format(rank, name, score))
def main():
    """Download the ranking page, parse it, and print the first 50 rows."""
    url = "http://www.zuihaodaxue.com/Greater_China_Ranking2019_0.html"
    uinfo = []
    # fillUnivList fills uinfo in place from the downloaded page text.
    fillUnivList(uinfo, getHTMLText(url))
    printUnivList(uinfo, 50)


# Script entry point: run the scrape when this snippet is executed.
main()
- 爬取豆瓣电影Top250
import requests
import bs4
import random
from bs4 import BeautifulSoup
tplt =