任务描述:将麦子学院指定网页上的教师信息(姓名、职称、介绍信息)爬取下来,并保存到 MySQL 数据库。
1.页面分析:
2.代码:
mydb.py:
#!/usr/bin/env python
#coding:utf-8
'''
操作数据库
'''
import MySQLdb as db
class DBHelper:
    """Small helper around one MySQLdb connection, bound to one table name.

    Attributes:
        tableName: table used by ``createTable`` and ``queryAll``.
        conn: connection object returned by ``db.connect``.
        cursor: cursor created from ``conn``; every statement runs on it.
    """

    def __init__(self, tableName):
        """Connect to the local ``pythondb`` database and open a cursor.

        Args:
            tableName: name of the table this helper works on.

        Raises:
            Exception: whatever ``db.connect`` / ``conn.cursor`` raised. The
                error is printed and then re-raised, because a helper without
                ``conn``/``cursor`` would only fail later with an unrelated
                ``AttributeError``.
        """
        self.tableName = tableName
        try:
            self.conn = db.connect(
                host='localhost',
                port=3306,
                user='root',
                passwd='root',
                db='pythondb',
                charset='utf8',
            )
            self.cursor = self.conn.cursor()
        except Exception as e:
            print(e)
            raise

    def _execute(self, sql, params=None):
        """Run ``sql`` on the shared cursor, binding ``params`` when given."""
        if params is None:
            return self.cursor.execute(sql)
        return self.cursor.execute(sql, params)

    def createTable(self, pros, types):
        """Create ``self.tableName`` with the given columns.

        Args:
            pros: column names, in order.
            types: SQL type for each column, same length as ``pros``.

        Raises:
            ValueError: if ``pros`` and ``types`` differ in length.
        """
        if len(pros) != len(types):
            raise ValueError(
                'pros and types must have the same length, got %d and %d'
                % (len(pros), len(types))
            )
        columns = ','.join(
            column + ' ' + sql_type for column, sql_type in zip(pros, types)
        )
        sql = 'create table ' + self.tableName + '(' + columns + ')'
        self.cursor.execute(sql)

    def insert(self, sql, params=None):
        """Execute an INSERT statement; on failure report it and roll back.

        This method does not commit; ``close`` calls ``conn.commit()``.

        Args:
            sql: the statement text. It is printed before execution.
            params: optional values bound by the driver to the placeholders
                in ``sql``. Prefer this over building ``sql`` from data.
        """
        try:
            print(sql)
            self._execute(sql, params)
            print('insert successfully!')
        except Exception as e:
            print('insert failed!')
            # Show the cause instead of discarding it.
            print(e)
            self.conn.rollback()

    def delete(self, sql, params=None):
        """Execute a DELETE statement; on failure report it and roll back.

        Args:
            sql: the statement text. It is printed before execution.
            params: optional values bound by the driver to the placeholders
                in ``sql``.
        """
        try:
            print(sql)
            self._execute(sql, params)
            print('delete successfully!')
        except Exception as e:
            print('delete failed!')
            # Show the cause instead of discarding it.
            print(e)
            self.conn.rollback()

    def queryBySql(self, sql, params=None):
        """Execute ``sql`` and return whatever ``cursor.execute`` returns.

        Args:
            sql: the statement text.
            params: optional values bound by the driver to the placeholders
                in ``sql``.
        """
        return self._execute(sql, params)

    def queryAll(self):
        """Return every row of ``self.tableName`` via ``cursor.fetchall()``."""
        self.cursor.execute('select * from ' + self.tableName)
        # Fetch the complete result set in one go.
        results = self.cursor.fetchall()
        return results

    def close(self):
        """Close the cursor, commit the connection, then close the connection."""
        self.cursor.close()
        self.conn.commit()
        self.conn.close()
if __name__ == '__main__':
    # Manual smoke test: insert one sample row, then print the whole table.
    print('test mydb DBHelper')
    helper = DBHelper('teacher')
    try:
        # To create the table first, run this once:
        #   helper.createTable(['name', 'title', 'production'],
        #                      ['varchar(20)', 'varchar(50)', 'varchar(200)'])
        sql = 'insert into teacher values("李希","成都莫比乌斯科技创始人","精通Windows及Linux系统平台的运维、大型分布式架构网站的部署和管理,具有15年资深IT从业经验。")'
        helper.insert(sql)
        for x in helper.queryAll():
            print(x)
    finally:
        # Always release the cursor/connection, even if the query above fails.
        helper.close()
mymodel.py:
#!/usr/bin/env python
#coding:utf-8
class Teacher:
    """Record for one teacher: name, title and ``production`` text.

    The three values are stored in ``_name``, ``_title`` and ``_production``
    and exposed both through explicit ``get_*``/``set_*`` methods and through
    the ``name``, ``title`` and ``production`` properties.
    """

    def __init__(self, name, title, production):
        """Store the three fields as given; no validation is performed."""
        self._name = name
        self._title = title
        self._production = production

    def get_name(self):
        """Return the stored name."""
        return self._name

    def set_name(self, value):
        """Replace the stored name with ``value``."""
        self._name = value

    def get_title(self):
        """Return the stored title."""
        return self._title

    def set_title(self, value):
        """Replace the stored title with ``value``."""
        self._title = value

    def get_production(self):
        """Return the stored production text."""
        return self._production

    def set_production(self, value):
        """Replace the stored production text with ``value``."""
        self._production = value

    def __str__(self):
        """Return ``'name =<name>,title =<title>,production =<production>'``.

        ``str.format`` is used rather than ``+`` so that non-string field
        values (for example ``None``) are rendered instead of raising
        ``TypeError``.
        """
        return 'name ={},title ={},production ={}'.format(
            self.name, self.title, self.production
        )

    # Attribute-style access backed by the getter/setter pairs above.
    name = property(get_name, set_name)
    title = property(get_title, set_title)
    production = property(get_production, set_production)
if __name__ == '__main__':
    # Manual check: print a Teacher, change every field through its
    # property, then print it again.
    print('test mymodel Teacher')
    teacher = Teacher('a', 't', 'p')
    print(teacher)
    for field, new_value in (('name', 'aa'), ('title', 'tt'), ('production', 'pp')):
        setattr(teacher, field, new_value)
    print(teacher)
main.py:
#!/usr/bin/env python
#coding:utf-8
import mydb,mymodel
import urllib
from urllib import request
import re
class SpiderMan:
    """Download one page, pull teacher entries out with regexes, store them.

    Attributes:
        url: address of the page to crawl.
        dbhelper: ``mydb.DBHelper`` bound to the ``teacher`` table.
    """

    # The section of the page that holds the teacher entries.
    _PATTERN_DIV = re.compile(
        r"<div class='sliderPlay' id='sliderPlay'>[\s\S]*div id='btnBox' class='btnBox'>"
    )
    # Lazy groups: a greedy ``.+`` would swallow trailing whitespace and, when
    # several <p> elements share a line, run on to the last ``</p>``.
    _PATTERN_NAME = re.compile(r'<p class="first">\s*(.+?)\s*</p>')
    _PATTERN_TITLE = re.compile(r'<p class="second">\s*(.+?)\s*</p>')
    _PATTERN_PRODUCTION = re.compile(r'<p class="third">\s*(.+?)\s*</p>')

    _HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
        'Host': 'www.maiziedu.com',
        'Referer': 'www.maiziedu.com',
    }

    def __init__(self, url):
        """Remember ``url`` and open a ``DBHelper`` for the ``teacher`` table."""
        self.url = url
        self.dbhelper = mydb.DBHelper('teacher')

    def crawl(self):
        """Fetch ``self.url``, insert one row per teacher, then close the helper.

        The page is fetched and parsed completely before the first insert, so
        a fetch or parse failure inserts nothing. The helper is closed in all
        cases.

        Raises:
            ValueError: if the teacher section is missing from the page, or
                if fewer titles/productions than names were found.
        """
        try:
            rows = self._parse(self._fetch())
            for name, title, production in rows:
                self.dbhelper.insert(
                    self._build_insert_sql(name, title, production)
                )
        finally:
            self.dbhelper.close()

    def _fetch(self):
        """Return the body of ``self.url`` decoded as UTF-8."""
        req = request.Request(self.url, headers=self._HEADERS)
        # The context manager closes the response once it has been read.
        with request.urlopen(req) as resp:
            return resp.read().decode('utf-8')

    def _parse(self, html):
        """Return ``(name, title, production)`` tuples found in ``html``."""
        section = self._PATTERN_DIV.search(html)
        if section is None:
            raise ValueError('teacher section not found in page: ' + self.url)
        html_div = section.group()
        name_list = self._PATTERN_NAME.findall(html_div)
        title_list = self._PATTERN_TITLE.findall(html_div)
        production_list = self._PATTERN_PRODUCTION.findall(html_div)
        # Rows are paired by position, so every name needs a partner.
        if len(title_list) < len(name_list) or len(production_list) < len(name_list):
            raise ValueError(
                'found %d names but only %d titles and %d productions'
                % (len(name_list), len(title_list), len(production_list))
            )
        return list(zip(name_list, title_list, production_list))

    def _build_insert_sql(self, name, title, production):
        """Return the INSERT statement for one teacher row."""
        values = ','.join(
            '"' + self._escape(value) + '"'
            for value in (name, title, production)
        )
        return 'insert into ' + self.dbhelper.tableName + ' values(' + values + ')'

    @staticmethod
    def _escape(value):
        """Backslash-escape ``\\`` and ``"`` for a double-quoted SQL string.

        NOTE(review): the values come from a web page, so they are untrusted.
        This assumes the server treats backslash as an escape character;
        driver-side parameter binding would be the more robust choice.
        """
        return value.replace('\\', '\\\\').replace('"', '\\"')
if __name__ == '__main__':
    # Script entry point: crawl the hard-coded course page once.
    SpiderMan('http://www.maiziedu.com/line/python/').crawl()
3.运行结果: