from bs4 import BeautifulSoup  # the beautifulsoup4 package is imported under the short name bs4
import requests
import pandas as pd
# Scrape architect listings (city / state / state of license) from
# blackarchitect.us and save them to US.csv.
#
# NOTE(review): the table cells are located by their fixed width/align
# attributes, which is fragile but matches this site's static layout.
response = requests.get('http://blackarchitect.us/', timeout=30)
response.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
soup = BeautifulSoup(response.text, 'html.parser')  # parser: html.parser

# Each column of the table uses distinct <td> attributes.
data_city = soup.find_all('td', valign="top", width="110")
data_state = soup.find_all('td', valign="top", width="47", align='center')
data_state_license = soup.find_all('td', valign="top", width="60", align='center')

# Extract the visible text of every cell (comprehensions replace the
# original append loops).
list_city = [td.text for td in data_city]
list_state = [td.text for td in data_state]
list_state_license = [td.text for td in data_state_license]

# Sanity check: all three columns should be the same length, otherwise the
# DataFrame constructor below raises ValueError on mismatched lists.
print(len(list_state), len(list_state_license), len(list_city))

df = pd.DataFrame({
    'city': list_city,
    'state': list_state,
    'state of license': list_state_license,
})
df.to_csv('US.csv')
# A fairly basic scraper: the bs4 and requests modules are enough. This page
# carries a large amount of data; after parsing the HTML, simply extract the
# cell contents.