import requests  # HTTP client library; used here to fetch pages and inspect server responses
import re
import json
from bs4 import BeautifulSoup
import copy
print('Crawling article links...')
List = []
for page in range(8):
    if page == 0:
        url = 'http://usagz.bailitop.com/public/'
    else:
        url = 'http://usagz.bailitop.com/public/' + str(page + 1) + '.html'
    print('----------- Crawling page ' + str(page + 1) + ' ------')
    html = requests.get(url)
    html.raise_for_status()
    html.encoding = 'utf-8'
    try:
        soup = BeautifulSoup(html.text, 'html.parser')
        soup = str(soup)
        # Regex that matches the article links on the page
        href = re.compile(r'http://usagz\.bailitop\.com/public/\d*/\d*\.html')
        URLs = re.findall(href, soup)
        flag = 0
        # Skip the duplicate links at the head of the list (the first four matches)
        for webUrl in URLs:
            flag = flag + 1
            if flag > 4:
                List.append(webUrl)
        # 15 articles per page
    except Exception as e:
        print(e)
print(List)
# Record template
data = {'title': '', 'content': '', 'time': ''}
dataList = []
for webSite in List:
    print('\n')
    html = requests.get(webSite)
    html.raise_for_status()
    html.encoding = 'utf-8'
    try:
        soup = BeautifulSoup(html.text, 'html.parser')
        soup = str(soup)
        # Title
        reg = re.compile(r'<div id="CLM_one">.*<h1>(.*)</h1>.*</div>', re.S)
        title = re.findall(reg, soup)
        title = title[0]
        if title.count('img'):
            # Strip the leading tag
            title = title.split('>', 1)
            title = title[1]
            # Strip the trailing tag
            title = title.split('<', 1)
            title = title[0]
        # Date
        reg = re.compile(r'\d{4}-\d\d-\d\d')
        date = re.findall(reg, soup)
        date = date[0]
        # Body text
        reg = re.compile(r'<div class="center_main">(.*)</div>.*<div class="text-c" id="pages"', re.S)
        content = re.findall(reg, soup)
        content = content[0]
        # Replace the brand name in the text
        content = content.replace('百利天下', '智课')
        # Fill in the record
        data['title'] = title
        data['content'] = content
        data['time'] = date
        # Append to the result list
        dataList.append(data)
        # Re-bind data to a fresh copy so the appended dict is not mutated next pass
        data = copy.copy(data)
    except Exception as e:
        print(e)
# Serialize to JSON; ensure_ascii=False keeps the Chinese text readable
jsonList = json.dumps(dataList, ensure_ascii=False)
print(jsonList)
# Write to file
with open("record.json", "w", encoding='utf-8') as f:
    f.write(jsonList)
    print("Finished writing file...")

# ===== Second script: scrape admission-case pages =====
import requests  # HTTP client library; used to fetch the case pages
import re
import json
from bs4 import BeautifulSoup
import copy
import urllib.request
import urllib.parse


def downloadPostPage(url, dictdata, headers, charset='utf-8', reqnum=5):
    # POST the form data and return the decoded response body,
    # retrying up to reqnum times (reqnum was accepted but unused before).
    data = bytes(urllib.parse.urlencode(dictdata), encoding=charset)
    req = urllib.request.Request(url, data, headers=headers, method='POST')
    info = None
    for _ in range(reqnum):
        try:
            response = urllib.request.urlopen(req)
            info = response.read().decode(charset)
            break
        except Exception as e:
            # Server or network error; try again
            print(e)
    return info
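
# Usage sketch (hypothetical; httpbin.org and the form body are assumptions,
# not part of the original flow) showing the retry helper in isolation:
# body = downloadPostPage('https://httpbin.org/post', {'q': 'test'},
#                         headers={'Content-Type': 'application/x-www-form-urlencoded'},
#                         reqnum=2)
# print(body)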

if __name__ == '__main__':
    # Record template for one admission case
    dic = {
        'title': 'title',
        'abstract': 'abstract',
        'studentInfo': {
            'study_exp': 'highest education',
            'school_type': 'school type',
            # Score categories: TOEFL, TOEFL Junior, SSAT, SLEP
            'grade': []
        },
        'offerInfo': {
            'school': 'value1',
            'degree': 'value2',
            'date': 'value3'
        },
        'paragraphs': [
            {'title': 'title1', 'content': 'content1'},
            {'title': 'title2', 'content': 'content2'},
            {'title': 'title3', 'content': 'content3'},
            {'title': 'title4', 'content': 'content4'},
            {'title': 'title5', 'content': 'content5'}
        ]
    }
    dicList = []
    urlList = []
    url = 'http://case.bailitop.com/cases/yuanxiaoajax.shtml'
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    }
    for jzgd in range(1):
        dictdata = {
            'jzgd': 4, 'type': 2, 'mbcountry': '美国', 'mbdegree': '高中', 'univ': '', 'major': '', 'gpa': '',
            'toefl': '',
            'ielts': ''
        }
        # Request url, POST parameters dictdata, constructed headers, attempt count reqnum
        info = downloadPostPage(url, dictdata, headers=headers, reqnum=1)
        jsonLoads = json.loads(info)[0]
        reg = re.compile(r'http://case\.bailitop\.com/yuanxiao/\d*\.shtml')
        urlList = urlList + re.findall(reg, jsonLoads)
    print(urlList)
    for webSite in urlList:
        print('\n')
        html = requests.get(webSite)
        html.raise_for_status()
        html.encoding = 'utf-8'
        try:
            data1 = {'type': '', 'value': ''}
            soup = BeautifulSoup(html.text, 'html.parser')
            # Article body
            soupContent = soup.find("div", class_="anli_wenzhang")
            title1 = str(soupContent.p)
            reg = re.compile('【(.*)】', re.S)
            title = re.findall(reg, title1)
            # print(title)
            flag = 1
            # Collect up to five 【...】-delimited blocks: titles as findall result
            # lists (indexing [0] later preserves the original skip-on-missing
            # behavior) and bodies as strings.
            titles = [title, [], [], [], []]
            contents = ['', '', '', '', '']
            for sibling in soupContent.p.next_siblings:
                sibling = str(sibling).replace('\r', '').replace('\n', '')
                if sibling.count('【'):
                    # A 【...】 heading opens the next block
                    flag = flag + 1
                    if 2 <= flag <= 5:
                        reg = re.compile('【(.*)】', re.S)
                        titles[flag - 1] = re.findall(reg, sibling)
                elif flag <= 5:
                    idx = flag - 1
                    contents[idx] = sibling if contents[idx] == '' else contents[idx] + '\n' + sibling
            # Strip <p> tags and full-width spaces from every block
            for i in range(5):
                contents[i] = contents[i].replace('</p>', '').replace('<p>', '').replace('\u3000', '')
            # Replace the brand name in blocks 3-5
            for i in range(2, 5):
                contents[i] = contents[i].replace('百利天下', '智课')
            contents[4] = contents[4].replace('\n', '').replace('<br/>', '')
            if contents[4].count('<p'):
                # Keep only the text before the first embedded <p> tag
                reg = re.compile('(.*?)<p', re.S)
                contents[4] = re.findall(reg, contents[4])[0]
            for i in range(5):
                dic['paragraphs'][i]['title'] = titles[i][0]
                dic['paragraphs'][i]['content'] = contents[i]
            soup = str(soup)
            # Abstract; a stray \r\n is removed by hand afterwards
            reg = re.compile('<p><strong>摘要</strong>:(.*)</p>.*<div.*class="zixun">', re.S)
            abstract = re.findall(reg, soup)[0]
            abstract = abstract.replace('百利天下', '智课')
            dic['abstract'] = abstract
            print(abstract)
            # Title; the occasional leftover <strong> is not handled here and is removed by hand
            reg = re.compile('<h2>(.*)</h2>', re.S)
            title = re.findall(reg, soup)[0]
            if title.count('<strong') == 1:
                reg = re.compile('<strong.*?>(.*)', re.S)
                title = re.findall(reg, title)[0]
                title = title.replace('</strong>', '')
            dic['title'] = title
            print(title)
            # Offer details: admitted school, degree awarded, enrollment date
            reg = re.compile(
                '<p>录取院校:(.*)</p>\n<p></p>\n<p>授予学位:(.*)</p.*<p>入学时间:(.*?)</p>\n</div>\n<div class="g_btns">', re.S)
            offerInfo = re.findall(reg, soup)[0]
            dic['offerInfo']['school'] = offerInfo[0]
            dic['offerInfo']['degree'] = offerInfo[1]
            dic['offerInfo']['date'] = offerInfo[2]
            print(offerInfo)
            # Student profile: try the full pattern first (education, school type, scores)
            reg = re.compile(
                '<p>最高教育经历:(.*)</p>\n<p>院校类型:(.*)</p>\n<p></p>\n<p>语言成绩:(.*?)</p>', re.S)
            studentInfo = re.findall(reg, soup)
            if len(studentInfo) == 0:
                # Only highest education and school type
                reg = re.compile(
                    '<p>最高教育经历:(.*)</p>\n<p>院校类型:(.*?)</p>\n<p></p>', re.S)
                studentInfo = re.findall(reg, soup)
                if len(studentInfo) == 0:
                    # Only school type and language scores
                    reg = re.compile(
                        '<p>院校类型:(.*?)</p>\n<p></p>\n<p>语言成绩:(.*?)</p>', re.S)
                    studentInfo = re.findall(reg, soup)
                    studentInfo = studentInfo[0]
                    grade = studentInfo[1]
                    grade = grade.replace('\xa0', ' ')  # non-breaking spaces from &nbsp;
                    grade = grade.replace(';', '')
                    grade = grade.replace('  ', ' ')  # collapse double spaces
                    dic['studentInfo']['study_exp'] = ''
                    dic['studentInfo']['school_type'] = studentInfo[0]
                    print('School type:', studentInfo[0], '|| Language scores:', grade)
                    # Scores arrive as space-separated "type value" pairs
                    gradeList = grade.split(' ')
                    for n in range((len(gradeList) - 1) // 2):
                        data1['type'] = gradeList[n * 2]
                        data1['value'] = gradeList[n * 2 + 1]
                        dic['studentInfo']['grade'].append(data1)
                        # Re-bind data1 so the appended dict is not mutated next pass
                        data1 = copy.copy(data1)
                else:
                    studentInfo = studentInfo[0]
                    dic['studentInfo']['study_exp'] = studentInfo[0]
                    dic['studentInfo']['school_type'] = studentInfo[1]
                    print('Highest education:', studentInfo[0], '|| School type:', studentInfo[1])
            else:
                studentInfo = studentInfo[0]
                grade = studentInfo[2]
                grade = grade.replace('\xa0', ' ')  # non-breaking spaces from &nbsp;
                grade = grade.replace(';', '')
                grade = grade.replace('  ', ' ')  # collapse double spaces
                dic['studentInfo']['study_exp'] = studentInfo[0]
                dic['studentInfo']['school_type'] = studentInfo[1]
                print('Highest education:', studentInfo[0], '|| School type:', studentInfo[1],
                      '|| Language scores:', grade)
                gradeList = grade.split(' ')
                for n in range((len(gradeList) - 1) // 2):
                    data1['type'] = gradeList[n * 2]
                    data1['value'] = gradeList[n * 2 + 1]
                    dic['studentInfo']['grade'].append(data1)
                    data1 = copy.copy(data1)
            dicList.append(dic)
            # Deep-copy dic for the next record and reset its grade list
            dic = copy.deepcopy(dic)
            dic['studentInfo']['grade'].clear()
        except Exception as e:
            print(e)
    # Serialize to JSON; ensure_ascii=False keeps the Chinese text readable
    jsonList = json.dumps(dicList, ensure_ascii=False)
    print(jsonList)
    # Write to file
    with open("CaseRecord.json", "w", encoding='utf-8') as f:
        f.write(jsonList)
        print("Finished writing file...")