Python 简单爬虫
爬取了 51job 网站上海、北京、广州、深圳地区 Python 和 Java 相关职位的薪资,运用正则表达式提取数据,并把爬取的结果保存到文本文档和 SQLite 数据库中。
代码
.
# -*- coding:utf-8 -*-
import re
import sqlite3
import urllib.parse
import urllib.request
def get_content(page, key, timeout=30):
    """Fetch one 51job search-result page and extract city/salary tuples.

    Args:
        page: Result page number inserted into the search URL.
        key: Search keyword. It is percent-encoded before being placed in
            the URL path so that spaces, slashes or non-ASCII characters do
            not break the request.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        A list of 5-tuples of strings ``(city, low, high, unit, period)``,
        one per regex match: ``city`` is one of 北京/上海/广州/深圳, ``low``
        and ``high`` are the numeric salary bounds as text, ``unit`` is
        万 or 千 and ``period`` is 年 or 月. Empty if nothing matches.

    Raises:
        urllib.error.URLError: If the request fails (a read that exceeds
            ``timeout`` may surface as a socket timeout error instead).
    """
    # Plain ASCII keywords such as 'python' pass through quote() unchanged.
    # NOTE(review): the site may expect a different encoding scheme for
    # non-ASCII keywords - confirm against a real search URL.
    quoted_key = urllib.parse.quote(key, safe='')
    url = (
        'https://search.51job.com/list/'
        '010000%252C020000%252C030200%252C040000,000000,0000,00,9,99,'
        + quoted_key + ',2,' + str(page) + '.html'
    )

    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        # The page is decoded as GBK; a stray undecodable byte is replaced
        # rather than aborting the whole page.
        html = response.read().decode('gbk', errors='replace')

    return re.findall(
        r'<span class="t3">(北京|上海|广州|深圳).*?</span>\s+<span class="t4">(\d+\.?\d?)-(\d+\.?\d?)(万|千)/(年|月)</span>',
        html,
    )
def _to_wan_per_month(amount, unit, period):
    """Normalize a salary figure to 万 per month, rounded to two decimals.

    ``unit`` "千" divides the amount by 10 and ``period`` "年" divides it
    by 12; any other unit/period leaves the amount as it is.
    """
    if unit == "千":
        amount /= 10
    if period == "年":
        amount /= 12
    return round(amount, 2)


# Scrape result pages 1-10 for each keyword and store every matched salary
# range both in the tab-separated file 51.txt and in the `jobs` table of the
# SQLite database 51.db. Rows from a previous run are deleted first.
conn = sqlite3.connect('51.db')
try:
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS jobs
                 (key text, addr text, min float, max float)''')
    c.execute('''delete from jobs''')
    conn.commit()

    # Explicit encoding: the city names written below are Chinese text, so
    # the platform default encoding must not decide whether the write works.
    with open('51.txt', 'w', encoding='utf-8') as f:
        f.write('%s\t%s\t%s\t%s\n' % ('key', 'addr', 'min', 'max'))
        for key in ('python', 'java'):
            for page in range(1, 11):
                for city, low, high, unit, period in get_content(page, key):
                    # Named low/high_salary so the builtins min() and max()
                    # are not shadowed at module level.
                    low_salary = _to_wan_per_month(float(low), unit, period)
                    high_salary = _to_wan_per_month(float(high), unit, period)
                    f.write('%s\t%s\t%s\t%s\n' % (key, city, low_salary, high_salary))
                    c.execute("INSERT INTO jobs VALUES (?,?,?,?)",
                              (key, city, low_salary, high_salary))
                # Commit once per page so rows already fetched are kept if a
                # later request fails.
                conn.commit()
finally:
    # Always release the database connection, even when a request or a
    # write raises part-way through the run.
    conn.close()
if __name__ == "__main__":
    # When run as a script, fetch the first result page for the "python"
    # keyword and print the list of tuples that get_content() extracted.
    lst = get_content(page=1, key="python")
    print(lst)
刚学python,可能有些地方不对,请大家批评指正,谢谢!