1.urllib 使用笔记
urlopen(url,data,timeout)
第一个参数url即为URL,第二个参数data是访问URL时要传送的数据,第三个timeout是设置超时时间。
第二三个参数是可以不传送的,data默认为空None,timeout默认为 socket._GLOBAL_DEFAULT_TIMEOUT
第一个参数URL是必须要传送的,在这个例子里面我们传送了百度的URL,执行urlopen方法之后,返回一个response对象,返回信息便保存在这里面。
#!/usr/bin/python
# -*- coding:utf-8 -*-
# urllib2_test.py
# author: sfzoro
# 2015-03-16 sfzoro@163.com
#只能访问最简单的网页,一般是网站的首页
import urllib2
url = 'http://python.org/'
#最简单方式
def use_urllib2():
try:
response = urllib2.urlopen('http://python.org/')
html = response.read()
except urllib2.URLError, e:
print e.reason
print len(html)
if __name__ == "__main__":
use_urllib2()
2.urllib2 设置 Headers 代理 Timeout
#!/usr/bin/python
# -*- coding:utf-8 -*-
# urllib2_test.py
# author: sfzoro
# 2015-03-17 sfzoro@163.com
import urllib
import urllib2
url = 'http://www.server.com/login'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {'username' : 'cqc', 'password' : 'XXXX' }
headers = { 'User-Agent' : user_agent }
# urlencode turns the dict into "username=cqc&password=XXXX" form data;
# passing it as the second Request argument makes this a POST.
data = urllib.urlencode(values)
request = urllib2.Request(url, data, headers)
response = urllib2.urlopen(request)
page = response.read()
# Anti-leech countermeasure: some servers check whether the Referer
# header in the request points back to the site itself.
headers = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' ,
'Referer':'http://www.zhihu.com/articles' }
# Proxy setup: install a global opener so that every later
# urllib2.urlopen() call in this process is routed through it.
import urllib2
enable_proxy = True
proxy_handler = urllib2.ProxyHandler({"http" : 'http://some-proxy.com:8080'})
null_proxy_handler = urllib2.ProxyHandler({})
# Pick the real proxy or the empty (direct-connection) one.
opener = urllib2.build_opener(proxy_handler if enable_proxy else null_proxy_handler)
urllib2.install_opener(opener)
# Timeout setting: keyword form, then the positional form
# urlopen(url, data, timeout) reusing `data` from the snippet above.
import urllib2
response = urllib2.urlopen('http://www.baidu.com', timeout=10)
response = urllib2.urlopen('http://www.baidu.com', data, 10)
3.urllib2 get和post
#!/usr/bin/python
# -*- coding:utf-8 -*-
# urllib2_test.py
# author: sfzoro
# 2015-03-16 sfzoro@163.com
#get方式
import urllib
import urllib2
values={}
values['username'] = "xxxxx@qq.com"
values['password']="XXXX"
data = urllib.urlencode(values)
url = "http://passport.csdn.net/account/login"
geturl = url + "?"+data
request = urllib2.Request(geturl)
response = urllib2.urlopen(request)
print response.read()
#!/usr/bin/python
# -*- coding:utf-8 -*-
# urllib2_test.py
# author: sfzoro
# 2015-03-16 sfzoro@163.com
#post方式
import urllib
import urllib2
values = {"username":"xxxxx@qq.com","password":"XXXX"}
data = urllib.urlencode(values)
url = "https://passport.csdn.net/account/login?from=http://my.csdn.net/my/mycsdn"
request = urllib2.Request(url,data)
response = urllib2.urlopen(request)
print response.read()
4.Python urllib 爬取基金数据
用的2.7 的python,爬取用urllib就不用再下其它的包了,不过代码多了点,下次改用Requests.
主要使用了http://skylark.readthedocs.io/en/latest/ 做ORM了,给个淘宝svn地址,大家自己下代码
http://code.taobao.org/svn/St_Spider/trunk/Stock
如果下载不了可以给我发邮件,不多扯了,下面是主代码
#!/usr/bin/python
# -*- coding:utf-8 -*-
# urllib2_test.py
# author: sfzoro
# 2015-03-16 sfzoro@163.com
#get方式
import re
import urllib
import urllib2
import sqlite3
from collections import OrderedDict
from PraseConfig import ParseConfig
from models import Founds
#from bs4 import BeautifulSoup
def srcapydata():
#获取配置信息
configdata = ParseConfig().parse()
values = OrderedDict()
values['fundcode'] = configdata.fundcode
values['startdate'] = configdata.startdate
values['enddate'] = configdata.enddate
data = urllib.urlencode(values)
geturl = configdata.url + "?"+data
print geturl
#设置http请求头
user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.154 Safari/537.36 LBBROWSER"
referer = "http://jingzhi.funds.hexun.com/DataBase/jzzs.aspx?fundcode=163412&startdate=2015-11-11&enddate=2016-04-14"
accAccept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
headers = { 'User-Agent' : user_agent, 'Referer': referer, 'Accept':accAccept}
#请求
request = urllib2.Request(geturl, headers=headers)
response = urllib2.urlopen(request)
html = response.read()
#解析
listData = re.findall( r'<td style="text-align: center;">(.*?)</td>', html, re.M|re.I)
listBaiFengBi = re.findall( r'<td style="text-align: center;" class="f_.*">(.*?)</td>', html, re.M|re.I)
listBaiFengBi2 = re.findall( r'<td style="text-align: center;" class="end">(.*?)</td>', html, re.M|re.I)
#insert
timeList = []
for i in range(len(listData)):
if(i%2 == 0):
timeList.append(listData[i])
valueList = []
for i in range(len(listData)):
if(i%2 == 1):
valueList.append(listData[i])
#创建数据库
dbPath = "G:\\2016\\coding\\trunk\\Stock\\founds_"
dbPath += configdata.fundcode
dbPath += ".db"
conn = sqlite3.connect(dbPath)
c = conn.cursor()
c.execute('''
CREATE TABLE if not exists "Founds" (
"id" INTEGER PRIMARY KEY AUTOINCREMENT,
"release_date" TEXT(10) NOT NULL,
"unit_price" REAL(10) NOT NULL,
"accumulative_net_value" REAL(10),
"daily_growth_rate" TEXT(10))
''')
conn.commit()
conn.close()
#保持数据
for i in range(len(timeList)):
founds = Founds()
founds.release_date = timeList[i]
founds.unit_price = valueList[i]
founds.accumulative_net_value = listBaiFengBi2[i]
founds.daily_growth_rate = listBaiFengBi[i]
founds.save()
if __name__ == "__main__":
srcapydata()