(一)步骤:
1)、通过网络连接获取内容
2)、对获得内容进行处理
(二)模块:
1)、requests 处理HTTP请求的工具
可获得 URL; HTTP会话; Cookie记录
requests网页请求:
get() ;
post() ;
requests对象属性:
status_code HTTP请求返回状态码, 200表示成功, 4xx/5xx表示请求失败(如404未找到)。
text HTTP响应内容的字符串形式,即url对应的页面内容。
方法:https://docs.python.org/3/library/urllib.request.html#urllib.request.Request
2)、beautifulsoup
import bs4
DOM document object model
步骤:
a 创建对象:
bs = BeautifulSoup(html文本(页面内容,而非url), html_parser(指定解析器), encoding(指定编码格式))
b 查询节点 find/find_all
举例:
"""
获取所有程序的api
"""
import requests
from bs4 import BeautifulSoup
def get_all_cities():
    """Scrape the city list from the pm25.in front page.

    :return: list of ``(city_name, city_pinyin)`` tuples, where
        ``city_pinyin`` is the href path with the leading '/' stripped.
    """
    url = 'http://pm25.in'
    # BUG FIX: the original called `requests(url, timeout=50)` -- the
    # requests module object is not callable; use requests.get().
    r = requests.get(url, timeout=50)
    soup_city = BeautifulSoup(r.text, 'lxml')
    # BUG FIX: find() returns a single Tag and integer-indexing a Tag
    # raises; use find_all() and take the second matching div, which
    # holds the full city link list on this page.
    city_div = soup_city.find_all('div', {'class': 'bottom'})[1]
    city_link_list = city_div.find_all('a')
    city_list = []
    for city_link in city_link_list:
        city_name = city_link.text
        city_pinyin = city_link['href'][1:]  # drop the leading '/'
        city_list.append((city_name, city_pinyin))
    return city_list
def get_city_api(city_pinyin):
    """Fetch the air-quality metrics for one city from pm25.in.

    :param city_pinyin: city identifier in pinyin, appended to the base
        URL as the path (e.g. ``'beijing'``).
    :return: list of up to 8 ``(caption, value)`` tuples, one per metric
        block on the city page.
    """
    url = 'http://pm25.in/' + city_pinyin
    r = requests.get(url, timeout=150)  # Response object
    soup = BeautifulSoup(r.text, 'lxml')
    div_list = soup.find_all('div', {'class': 'span1'})
    city_api = []
    # ROBUSTNESS: the original hard-coded range(8) and would raise
    # IndexError on a page with fewer metric blocks; a slice caps the
    # count at 8 while tolerating shorter pages.
    for div_content in div_list[:8]:
        caption = div_content.find('div', {'class': 'caption'}).text.strip()
        value = div_content.find('div', {'class': 'value'}).text.strip()
        city_api.append((caption, value))  # store each metric as a tuple
    return city_api
def main():
    """Entry point: gather every city's metrics and print each result."""
    all_metrics = []
    for city_name, city_pinyin in get_all_cities():
        metrics = get_city_api(city_pinyin)
        print(metrics)
        all_metrics.append(metrics)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()