Web scrapers are usually written in Python, and the most commonly used tools are the requests library, the beautifulsoup4 library, and the Scrapy framework. They differ in scope: requests and beautifulsoup4 are well suited to lightweight scrapers, while Scrapy is a more complete crawling framework. The relationship is similar to choosing, in Java, between raw JDBC and the MyBatis framework for database access: the parsing libraries are simpler and lighter, while Scrapy is more fully featured. Another library worth mentioning is selenium, which is commonly used in Python automation scripts to simulate page clicks, logins, and similar browser interactions. A short Scrapy sketch follows for comparison.
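To make the comparison concrete, here is a minimal Scrapy spider sketch. The spider name, start URL, and CSS selectors are placeholders for illustration only and are not taken from the example below:

# Minimal Scrapy spider sketch (illustrative only; the start URL and
# selectors are placeholders, not the real page structure).
import scrapy

class SchoolSpider(scrapy.Spider):
    name = "school"
    start_urls = ["https://example.com/schools"]

    def parse(self, response):
        # Walk each table row and yield it as a structured item
        for row in response.css("table.ch-table tr"):
            cells = row.css("td::text").getall()
            if len(cells) >= 3:
                yield {
                    "name": cells[0].strip(),
                    "province": cells[1].strip(),
                    "affiliation": cells[2].strip(),
                }

A spider like this would typically be run with scrapy runspider, letting Scrapy handle request scheduling and item pipelines that the lightweight approach below does by hand.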
In this post we write a scraper example with beautifulsoup4:
#!/usr/bin/python
# coding:utf-8
from urllib.error import HTTPError
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql
# Fetch the page and extract the school information tables
def finfAllinformation(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read(), "html.parser")
        allData = bsObj.findAll("table", {"class": "ch-table"})
    except AttributeError as e:
        return None
    return allData
# Parse the table rows into (name, province, affiliation) tuples
def handData(info):
    if info is None:
        print("No school information found")
        return []
    dataList = []
    school_list = []
    print(type(info))
    for item in info:
        rows = item.findAll("tr")
        for x in rows:
            school = x.findAll("td")
            # Keep only rows that actually contain data cells
            if len(school):
                school_list.append(school[0:3])
            else:
                continue
    for item in school_list:
        school_name = item[0].get_text().strip()
        school_shenfen = item[1].get_text().strip()
        school_belong = item[2].get_text().strip()
        print(school_name, school_shenfen, school_belong)
        print("----------------------------------------------------------------------")
        tup = (school_name, school_shenfen, school_belong)
        dataList.append(tup)
    return dataList
# Insert the collected rows into MySQL
def addMysql(data):
    print(str(data))
    # Connection parameters come from the original post; adjust to your environment
    db = pymysql.connect(host="10.20.10.161", user="root",
                         password="mima", database="runoob_db")
    cursor = db.cursor()
    sql = "insert into school(name, shenfen, belong) values (%s, %s, %s)"
    val = data
    try:
        cursor.executemany(sql, val)
        db.commit()
        print("Rows inserted: " + str(len(data)))
    except Exception as ex:
        print("Insert failed: " + str(ex))
        db.rollback()
    finally:
        db.close()
data = finfAllinformation("https://yz.chsi.com.cn/sch/?start=0")
datalist = handData(data)
addMysql(datalist)
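Note that the executemany call above assumes a school table with name, shenfen, and belong columns already exists in runoob_db. If it does not, a rough sketch of creating it with pymysql might look like the following; the column types and lengths are assumptions, not part of the original post:

# Rough sketch: create the school table that addMysql() expects.
# Column types and lengths are assumptions, not from the original post.
import pymysql

db = pymysql.connect(host="10.20.10.161", user="root",
                     password="mima", database="runoob_db")
try:
    with db.cursor() as cursor:
        cursor.execute(
            "create table if not exists school ("
            " id int primary key auto_increment,"
            " name varchar(255),"
            " shenfen varchar(64),"
            " belong varchar(255))"
        )
    db.commit()
finally:
    db.close()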
Original article: https://blog.csdn.net/qq_40202995/article/details/111930999