1. 首先,你需要一个可以查很多城市PM值的网站,这个就不错:http://www.chapm25.com
2. 然后就是分析每个城市的链接规律了,http://www.chapm25.com/city/1.html,发现他们是用数字排列的
3. 连接数据库操作,在数据库里面建好表和字段
4. 根据所有城市的数字代号拼接URL
5. 使用BeautifulSoup将网页中特定的信息爬出来
6. 将获取的信息进行编码转换,存入到数据库里面
7. 搞定,收工
代码如下:
# -*- coding:utf8 -*-
# The declaration above fixes the encoding of this source file.
"""Fetch the per-city pages of www.chapm25.com and store each city's
readings in the MySQL table ``t_pm``.

For every accepted city id the script downloads
``http://www.chapm25.com/city/<id>.html``, normalises the page to UTF-8,
extracts six text fields with BeautifulSoup and inserts them as one row.
All rows are committed together after the loop finishes.
"""
import urllib2

import chardet
import MySQLdb
from bs4 import BeautifulSoup

webURL = 'http://www.chapm25.com'


def _fetch_utf8_html(url):
    """Download *url* and return its body as UTF-8 encoded bytes."""
    raw = urllib2.urlopen(url, timeout=120).read()
    # Work around mojibake: anything chardet does not report as UTF-8 is
    # treated as GB2312 and re-encoded, dropping undecodable bytes.
    detected = chardet.detect(raw)['encoding']
    if detected in ('utf-8', 'UTF-8'):
        return raw
    return raw.decode('gb2312', 'ignore').encode('utf-8')


def _parse_city_page(html):
    """Return (city, province, pmNum, suggest, rand, conclusion) as text.

    Raises AttributeError when an expected element is missing, because
    ``find`` returns None in that case.
    """
    chapter_soup = BeautifulSoup(html)
    # Several fields live in the first 'row-fluid' div; look it up once.
    summary = chapter_soup.find('div', class_='row-fluid')
    first_h4 = summary.find('h4')
    city = summary.find('h3').get_text()
    province = chapter_soup.find('a', class_='province').get_text()
    pmNum = summary.find('span').get_text()
    suggest = first_h4.get_text()
    rand = first_h4.find_next_sibling('h4').get_text()
    conclusion = chapter_soup.find('h3', class_='review').get_text()
    return city, province, pmNum, suggest, rand, conclusion


try:
    conn = MySQLdb.connect(host='localhost', user='root', passwd='root',
                           db='kfxx', port=3306, charset='utf8')
    try:
        cur = conn.cursor()
        try:
            # Walk city ids 1..205. Ids 92-101, 130-140 and 168 are skipped
            # by the filter below (presumably they have no page -- confirm).
            for i in range(1, 206):
                if not (i < 92 or 101 < i < 130 or (i > 140 and i != 168)):
                    continue
                cityURL = 'http://www.chapm25.com/city/' + str(i) + '.html'
                print(cityURL)
                fields = _parse_city_page(_fetch_utf8_html(cityURL))
                row = tuple(value.encode('utf-8') for value in fields)
                print(row[0])
                # Parameterized insert: the driver escapes the scraped text,
                # so quotes in a page can neither break nor inject SQL.
                cur.execute(
                    'insert into t_pm values(%s,%s,%s,%s,%s,%s)', row)

            conn.commit()  # commit the inserted rows
        finally:
            cur.close()
    finally:
        # Release the connection even when fetching or parsing fails.
        conn.close()
except MySQLdb.Error as e:
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))