一、说明
由于国家线快出了，故写了一份爬取小木虫网站调剂信息的爬虫代码，方便查看信息。此代码仅用于学习，不用于任何商业用途。
本代码可爬取小木虫任何年份、任何专业的调剂信息。
二、代码
#!~/opt/anaconda3/bin/python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
# 获取网页
def getHTMLText(url):
    """Download ``url`` and return the response body as text.

    The request is made with a 30-second timeout. The response encoding is
    set to the one detected from the content (``apparent_encoding``) before
    the text is decoded.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or an empty string if the request fails
        (connection error, timeout, invalid URL, or a 4xx/5xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Convert 4xx/5xx responses into an exception so they also yield ''.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: only request-level failures are swallowed, so
        # KeyboardInterrupt, SystemExit and programming errors still surface
        # instead of being hidden by a bare ``except``.
        return ''
# 获取数据
def getDataInfo(infoList, url, pre_params, *args):
params = []
count = -1
for i in args:
count += 1
par_ = pre_params[count] + i
params.append(par_)
# 根据参数获取访问链接
for param in params:
url += param + '&'
# print(url)
html = getHTMLText(url)
soup = BeautifulSoup(html, 'html.parser')
# 获取页码数,并处理空页异常
try:
pages_tag = soup.find_all(