1. 首先,你需要一个可以查很多城市PM值的网站,这个就不错:http://www.chapm25.com
2. 然后就是分析每个城市的链接规律了,http://www.chapm25.com/city/1.html,发现它们是按数字顺序排列的
3. 连接数据库操作,在数据库里面建好表和字段
4. 根据所有城市的数字代号拼接URL
5. 使用BeautifulSoup将网页中特定的信息爬出来
6. 将获取的信息进行编码转换,存入到数据库里面
7. 搞定,收工
代码如下:
# -*- coding:utf8 -*-
#首先用于确定编码,加上这句
import urllib2
import chardet
import MySQLdb
from bs4 import BeautifulSoup
webURL = 'http://www.chapm25.com'
try:
conn=MySQLdb.connect(host='localhost',user='root',passwd='root',db='kfxx',port=3306,charset='utf8')
cur=conn.cursor()
#遍历所有205个城市的URL
for i in range(1,206):
if(i<92 or (i>101 and i<130) or (i>140 and i != 168)):
cityURL = 'http://www.chapm25.com/city/' + str(i) + '.html'
print cityURL
#解决乱码问题
html_1 = urllib2.urlopen(cityURL,timeout=120).read()
mychar = chardet.detect(html_1)
bianma = mychar['encoding']
if bianma == 'utf-8' or bianma == 'UTF-8':
html = html_1
else :
html = html_1.decode('gb2312','ignore').encode('utf-8')
chapter_soup = BeautifulSoup(html)
city = chapter_soup.find('div',class_ = 'row-fluid').find('h1').get_text()
province = chapter_soup.find('a',class_ = 'province').get_text()
pmNum = chapter_soup.find('div',class_ = 'row-fluid').find('span').get_text()
suggest = chapter_soup.find('div',class_ = 'row-fluid').find('h2').get_text()
rand = chapter_soup.find('div',class_ = 'row-fluid').find('h2').find_next_sibling('h2').get_text()
face = chapter_soup.find('div',class_ = 'span4 pmemoji').find('h1').get_text()
conclusion = chapter_soup.find('h1',class_ = 'review').get_text()
print city.encode('utf-8')
cur.execute('insert into t_pm values(\''+city.encode('utf-8')
+'\',\''+province.encode('utf-8')
+'\',\''+pmNum.encode('utf-8')
+'\',\''+suggest.encode('utf-8')
+'\',\''+rand.encode('utf-8')
+'\',\''+conclusion.encode('utf-8')+'\')')
conn.commit() #插入后用来提交动作
cur.close()
conn.close()
except MySQLdb.Error,e:
print "Mysql Error %d: %s" % (e.args[0], e.args[1])
还不错吧,用来给自己的网站润色吧。