由于项目需要,要爬取一个网站的信息。
话不多说,直接上代码
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
# Scrape address / district / telephone info from a paginated listing site.
#
# NOTE(review): the original version fetched only the first page, computed a
# per-page URL (`base_url`) but never requested it, and then re-processed the
# SAME first-page items `page_num` times. Fixed: each page is now fetched and
# parsed inside the loop. Responses are closed via `with`, and list/marker
# lookups are guarded so a malformed item is skipped instead of crashing.

url = "xx"  # base URL, redacted for confidentiality
page_num = 20  # the site has a fixed, known page count, so it is hard-coded

# Compile the regexes once, outside the loops.
item_id_re = re.compile(r"^item-(\s\w+)?")           # matches item <li> ids
content_class_re = re.compile(r"^entry-content(\s\w+)?")  # matches detail div class

all_content = []
for page_index in range(page_num):
    # URL of each listing page. NOTE(review): pagination starts at 0 here;
    # confirm the site does not start at /page/1 instead.
    page_url = url + '/page/' + str(page_index)
    with urlopen(page_url) as page_res:
        page_soup = BeautifulSoup(page_res.read(), 'html.parser')

    # Locate the item <li> tags on THIS page (previously only page one).
    items = page_soup.find_all(name='li', attrs={'id': item_id_re})
    for item in items:
        print("new item")
        links = item.find_all('a')
        if not links or 'href' not in links[0].attrs:
            continue  # item without a detail link — skip instead of IndexError
        sub_url = links[0].attrs['href']  # detail page of this item

        with urlopen(sub_url) as sub_res:
            sub_soup = BeautifulSoup(sub_res.read(), 'html.parser')

        # Find the content block that holds the address/phone text.
        divs = sub_soup.find_all(name='div', attrs={'class': content_class_re})
        if not divs:
            continue  # no entry-content div on this detail page — skip
        info_text = divs[0].get_text()

        # Guard the marker-based parsing: skip pages missing either field.
        if '地址:' not in info_text or '电话:' not in info_text:
            continue
        address = info_text.split('地址:')[1].split('\n')[0]
        # District is the text between '北京' and '区'; empty if marker absent.
        district = address.split('北京')[1].split('区')[0] if '北京' in address else ''
        telephone = info_text.split('电话:')[1].split('\n')[0]

        print("地址:{}, 区:{}, 电话:{}, ".format(address, district, telephone))
        all_content.append({'address': address, 'district': district, 'telephone': telephone})