同城旅游网 爬虫练习
- 类库安装
- pip install requests
- pip install beautifulsoup4
- 代码
import json
import os

import requests
from bs4 import BeautifulSoup
def get_html(url, timeout=10):
    """Fetch the HTML source of *url*.

    :param url: page URL to download
    :param timeout: request timeout in seconds (default 10) — without it
        ``requests.get`` can block forever on a stalled connection
    :return: decoded HTML text on HTTP 200, otherwise ``None``
    """
    # A dated IE User-Agent; some sites block requests' default UA.
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Let requests guess the real charset from the body; the site may not
    # declare it in the Content-Type header.
    response.encoding = response.apparent_encoding
    if response.status_code == 200:
        return response.text
    print("网络访问出错")
    return None
def parse_html(html):
    """Parse the tour-listing page into a list of item dicts.

    :param html: HTML source of the listing page
    :return: list of dicts with keys ``type``, ``title``, ``price`` and
        ``satisfied`` (the literal string ``'None'`` when no rating exists)
    """
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select("#tagList > ul > li > div.line-info > div")
    info_list = []
    for each in items:
        # Select on the element directly; the original re-parsed
        # str(each) into a brand-new soup per item, which is pure overhead.
        info = {
            'type': each.select('div.line-imgbox > span')[0].string,
            'title': each.select('p.line-title > b')[0].string,
            # next_sibling of <em> is the text node holding the price number
            'price': each.select('div.line-pricebox > div > p')[0].em.next_sibling,
        }
        sat = each.select('p.sat-num')
        # Some listings carry no satisfaction rating at all.
        info['satisfied'] = sat[0].em.previous_sibling if sat else 'None'
        info_list.append(info)
    return info_list
def save_file(path, text):
    """Write *text* to *path* as UTF-8, creating parent directories if needed.

    :param path: destination file path
    :param text: text content to store
    :return: None

    Bug fixed: the original derived the directory with
    ``path.strip(file_name)``, but ``str.strip`` removes a *character set*
    from both ends, not a suffix — e.g. ``'info/info.json'`` collapsed to
    garbage. ``os.path.dirname`` is the correct operation.
    """
    dir_path = os.path.dirname(path)
    # makedirs(exist_ok=True) is recursive and avoids the exists()/mkdir()
    # race; skip it when the path has no directory component.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)
    with open(path, 'w', encoding='UTF-8') as file:
        file.write(text)
if __name__ == '__main__':
    url = 'https://www.ly.com/dujia/taiguo-lvyou/f394/'
    html = get_html(url)
    # get_html returns None on a non-200 response; don't parse in that case.
    if html:
        info_list = parse_html(html)
        # json.dumps emits valid JSON; the old str(...).replace("'", '"')
        # hack broke on values containing quotes and on None values.
        save_file('./lyinfo.json', json.dumps(info_list, ensure_ascii=False))