爬取财富500强的内容
from urllib.request import urlopen,Request
from bs4 import BeautifulSoup
import re
# Download the Fortune China "Fortune 500" page and print one formatted line
# for every data row found in the first <tbody> of the document.
url = "http://www.fortunechina.com/fortune500/c/2020-08/10/content_372148.htm"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
ret = Request(url, headers=headers)
# BeautifulSoup parses the whole document in its constructor, so the HTTP
# response can be closed as soon as the soup has been built.
with urlopen(ret) as html:
    bs = BeautifulSoup(html, "html.parser")
tr = bs.find('tbody').find_all('tr')
for j in tr:
    td = j.find_all('td')
    if len(td) < 5:
        # The code below reads five cells; skip rows that do not have them
        # instead of failing with IndexError.
        continue
    # Cells are read positionally; the variable names come from the original
    # script (presumably rank, company name, revenue, profit, country).
    number, names, income, profit, country = (
        cell.get_text().strip() for cell in td[:5]
    )
    # Output order differs from cell order: the name column is printed last.
    print('{0:<20}\t{1:<20}\t{2:<20}\t{3:<20}\t{4:<20}'.format(
        number, income, profit, country, names))
效果图
爬取豆瓣TOP250
from urllib.request import urlopen,Request
from bs4 import BeautifulSoup
import re
# Walk the ten result pages of the Douban Top 250 list (start=0, 25, ..., 225)
# and print "number title score" for every entry on each page.
# The request headers never change, so they are built once outside the loop.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
for i in range(0, 250, 25):
    url = "https://movie.douban.com/top250?start={0}&filter=".format(i)
    ret = Request(url, headers=headers)
    # The soup is fully built inside the constructor, so the response can be
    # closed right after parsing.
    with urlopen(ret) as html:
        bs = BeautifulSoup(html, "html.parser")
    names = bs.find_all('span', {'class': "title"})
    scores = bs.find_all('span', {'class': "rating_num"})
    numbers = bs.find_all('em', {'class': ""})
    # Drop every title span whose second character is '/' (presumably the
    # alternate-title spans; confirm against the page markup). The slice
    # name[1:2] is used instead of name[1] so that a one-character title
    # does not raise IndexError.
    name_list = []
    for name in names:
        name = name.get_text()
        if name[1:2] != '/':
            name_list.append(name)
    # zip() stops at the shortest sequence, so a page where the three lists
    # disagree in length prints only the rows that line up.
    for number, name, score in zip(numbers, name_list, scores):
        print(number.get_text(), name, score.get_text())
效果图
爬取中国大学
from urllib.request import urlopen,Request
from bs4 import BeautifulSoup
import re
# Download the shanghairanking.cn "bcur/2020" ranking page and collect the
# <tr> rows of its first <tbody>.
url = "http://www.shanghairanking.cn/rankings/bcur/2020"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
ret = Request(url, headers=headers)
# The document is parsed eagerly, so the HTTP response is closed as soon as
# the soup exists.
with urlopen(ret) as html:
    bs = BeautifulSoup(html, "html.parser")
# Table cells are reached via tbody -> tr -> td.
tr = bs.find('tbody').find_all('tr')
#listall=[]
for j in tr[0:]:
td=j.find_all('td')
number=td[0].get_text().strip()
college=td[1].get_text().strip()
province=td[2].get_text().strip()
type=td[3].get_text().strip()
score=td[4].get_text