一. 运行结果
image.png
爬取结果存储至MySQL数据库如下图所示。
image.png
运行结果及保存TXT文件如下所示:
image.png
二. BeautifulSoup爬虫详解
# python2.7
# -*- coding: utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup
import codecs
import MySQLdb
# 存储数据库
# 参数:职位名称 公司名称 职位月薪 工作地点 发布时间 职位链接
def DatabaseInfo(zwmc, gsmc, zwyx, gzdd, gxsj, zwlj):
try:
conn = MySQLdb.connect(host='127.0.0.1', user='root',
passwd='123456', port=3306, db='zhilian')
cur = conn.cursor() # 数据库游标
# 报错:UnicodeEncodeError: 'latin-1' codec can't encode character
conn.set_character_set('utf8')
cur.execute('SET NAMES utf8;')
cur.execute('SET CHARACTER SET utf8;')
cur.execute('SET character_set_connection=utf8;')
# SQL语句 智联招聘(zlzp)
sql = 'insert into zhilian_zlzp' \
'(zwmc,gsmc,zwyx,gzdd,gxsj,zwlj) ' \
'values(%s, %s, %s, %s, %s, %s)'
cur.execute(sql, (zwmc, gsmc, zwyx, gzdd, gxsj, zwlj))
print '数据库插入成功'
# 异常处理
except MySQLdb.Error, e:
print "Mysql Error %d: %s" % (e.args[0], e.args[1])
finally:
cur.close()
conn.commit()
conn.close()
# 爬虫函数
def crawl(url):
#page = urllib2.urlopen(url)
headers= {
'Host':'sou.zhaopin.com',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
page = requests.get(url,headers=headers)
contents = page.text
soup = BeautifulSoup(contents, "html.parser")
#print u'贵阳JAVA招聘信息: 职位名称 \t 公司名称 \t 职位月薪 \t 工作地点 \t 发布日期 \n'
infofile.write(u"贵阳JAVA招聘信息: 职位名称 \t 公司名称 \t 职位月薪 \t 工作地点 \t 发布日期 \r\n")
print u'爬取信息如下:\n'
i = 0
for tag in soup.find_all(attrs={"class": "newlist"}):
# print tag.get_text()
i = i + 1
# 职位名称
zwmc = tag.find(attrs={"class": "zwmc"}).get_text()
zwmc = zwmc.replace('\n', '')
print zwmc
# 职位链接
url_info = tag.find(attrs={"class": "zwmc"}).find_all("a")
# print url_info
# url_info.get(href) AttributeError: 'ResultSet' object has no attribute 'get'
ins = ''
for u in url_info:
zwlj = u.get('href')
ins = zwlj
print zwlj
# 公司名称
gsmc = tag.find(attrs={"class": "gsmc"}).get_text()
gsmc = gsmc.replace('\n', '')
print gsmc
# find另一种定位方法
8000-16000zz = tag.find_all('td', {"class": "zwyx"})
#print zz
# 职位月薪
zwyx = tag.find(attrs={"class": "zwyx"}).get_text()
zwyx = zwyx.replace('\n', '')
print zwyx
# 工作地点
gzdd = tag.find(attrs={"class": "gzdd"}).get_text()
gzdd = gzdd.replace('\n', '')
print gzdd
# 发布时间
gxsj = tag.find(attrs={"class": "gxsj"}).get_text()
gxsj = gxsj.replace('\n', '')
print gxsj
# 获取当前日期并判断写入文件
import datetime
now_time = datetime.datetime.now().strftime('%m-%d') # %Y-%m-%d
print now_time
if True:
print u'存入文件'
infofile.write(u"[职位名称]" + zwmc + "\r\n")
infofile.write(u"[公司名称]" + gsmc + "\r\n")
infofile.write(u"[职位月薪]" + zwyx + "\r\n")
infofile.write(u"[工作地点]" + gzdd + "\r\n")
infofile.write(u"[发布时间]" + gxsj + "\r\n")
infofile.write(u"[职位链接]" + ins + "\r\n\r\n")
#else:
#print u'日期不一致,当前日期: ', now_time
#####################################
# 重点:写入MySQL数据库
#####################################
if True:
print u'存入数据库'
DatabaseInfo(zwmc, gsmc, zwyx, gzdd, gxsj,ins)
print '\n\n'
else:
print u'爬取职位总数', i
# 主函数
if __name__ == '__main__':
infofile = codecs.open("Result_ZP.txt", 'a', 'utf-8')
# 翻页执行crawl(url)爬虫
i = 1
while i <= 1:
print u'页码', i
url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?in=160400&jl=厦门&kw=python&p=' + str(
i) + '&isadv=0'
crawl(url) infofile.write("###########################\r\n\r\n\r\n")
i = i + 1
infofile.close()
三. 数据库操作
-- Table for scraped zhaopin.com job records.
-- Renamed from `eastmount_zlzp` to `zhilian_zlzp` so it matches the table
-- the Python INSERT statement actually targets.
CREATE TABLE `zhilian_zlzp` (
  `ID` int(11) NOT NULL AUTO_INCREMENT,
  `zwmc` varchar(100) COLLATE utf8_bin DEFAULT NULL COMMENT '职位名称',
  `gsmc` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '公司名称',
  `zwyx` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '职位月薪',
  `gzdd` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '工作地点',
  `gxsj` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '发布时间',
  -- Job URLs routinely exceed 50 characters; widened to avoid truncation.
  `zwlj` varchar(200) COLLATE utf8_bin DEFAULT NULL COMMENT '职位链接',
  `info` varchar(200) COLLATE utf8_bin DEFAULT NULL COMMENT '详情',
  PRIMARY KEY (`ID`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
显示如下图所示:
image.png
# coding:utf-8
import MySQLdb
try:
conn=MySQLdb.connect(host='localhost',user='root',passwd='123456',port=3306, db='test01')
cur=conn.cursor()
#插入数据
sql = '''''insert into student values(%s, %s, %s)'''
cur.execute(sql, ('yxz','111111', '10'))
#查看数据
print u'\n插入数据:'
cur.execute('select * from student')
for data in cur.fetchall():
print '%s %s %s' % data
cur.close()
conn.commit()
conn.close()
except MySQLdb.Error,e:
print "Mysql Error %d: %s" % (e.args[0], e.args[1])