# 说明:
# - 数据暂时不能爬取全,由于频繁请求会超时
# - 建议配置多个代理轮询调用爬取
# - 只爬取到区的信息,可以不用上代理
# -*-coding:utf-8 -*-
import urllib2
import sys
# 接上面代码
from bs4 import BeautifulSoup as bs
# Py2-only hack: site.py deletes ``sys.setdefaultencoding`` at startup, so
# ``reload(sys)`` is needed to restore it.  Forcing the default codec to GBK
# lets implicit str<->unicode conversions of the (GBK-encoded) page content
# succeed instead of raising UnicodeDecodeError under the ASCII default.
reload(sys)
sys.setdefaultencoding('GBK')
def get_url_content(url, timeout=10, encoding='GBK'):
    """Fetch *url* and return its body decoded as unicode text.

    A browser-like User-Agent and a Referer header are sent because the
    target site rejects requests bearing the default urllib2 agent.

    :param url: address of the page to fetch
    :param timeout: socket timeout in seconds (default 10, as before)
    :param encoding: charset of the response body (default ``'GBK'``,
        matching the target site)
    :return: decoded page content (unicode)
    :raises urllib2.URLError: on connection failure or timeout
    """
    i_headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",
        "Referer": 'http://www.baidu.com',
    }
    req = urllib2.Request(url, headers=i_headers)
    # To route requests through a proxy (see the notes at the top of the
    # file about rotating proxies), install an opener before urlopen:
    # proxies = {"http": "114.244.112.220:8118"}
    # urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(proxies)))
    response = urllib2.urlopen(req, timeout=timeout)
    try:
        return response.read().decode(encoding)
    finally:
        # The original never closed the response, leaking the socket.
        response.close()
html_data = get_url_conte