关闭

Python urllib 爬取基金数据

标签: python爬虫数据
1461人阅读 评论(0) 收藏 举报
分类:

1.urllib 使用笔记

urlopen(url,data,timeout)
第一个参数url即为URL,第二个参数data是访问URL时要传送的数据,第三个timeout是设置超时时间。
第二三个参数是可以不传送的,data默认为空None,timeout默认为 socket._GLOBAL_DEFAULT_TIMEOUT
第一个参数URL是必须要传送的,在这个例子里面我们传送了百度的URL,执行urlopen方法之后,返回一个response对象,返回信息便保存在这里面。

#!/usr/bin/python
# -*- coding:utf-8 -*-
# urllib2_test.py
# author: sfzoro
# 2015-03-16 sfzoro@163.com

#只能访问最简单的网页,一般是网站的首页
import urllib2

url = 'http://python.org/'
#最简单方式
def use_urllib2():
    try:
       response = urllib2.urlopen('http://python.org/')
       html = response.read()
    except urllib2.URLError, e:
        print e.reason
    print len(html)

if __name__ == "__main__":
  use_urllib2()

2.urllib2 设置 Headers 代理 Timeout

#!/usr/bin/python
# -*- coding:utf-8 -*-
# urllib2_test.py
# author: sfzoro
# 2015-03-17 sfzoro@163.com

import urllib  
import urllib2  

# Demo: a POST login request carrying form data plus a custom User-Agent.
url = 'http://www.server.com/login'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
values = {'username': 'cqc', 'password': 'XXXX'}
data = urllib.urlencode(values)
request = urllib2.Request(url, data=data, headers=headers)
response = urllib2.urlopen(request)
page = response.read()

# Anti-leech defence: some servers check that the Referer header points
# back at their own site, so spoof it together with the User-Agent.
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
           'Referer': 'http://www.zhihu.com/articles'}

# Proxy setup: choose a handler according to the flag, then install the
# resulting opener globally so every later urlopen() goes through it.
import urllib2
enable_proxy = True
proxy_handler = urllib2.ProxyHandler({"http": 'http://some-proxy.com:8080'})
null_proxy_handler = urllib2.ProxyHandler({})
opener = urllib2.build_opener(proxy_handler if enable_proxy else null_proxy_handler)
urllib2.install_opener(opener)

# Timeout: passed either as a keyword, or positionally after the POST
# data (`data` here is the urlencoded form from the example above).
import urllib2
response = urllib2.urlopen('http://www.baidu.com', timeout=10)
response = urllib2.urlopen('http://www.baidu.com', data, 10)

3.urllib2 get和post

#!/usr/bin/python
# -*- coding:utf-8 -*-
# urllib2_test.py
# author: sfzoro
# 2015-03-16 sfzoro@163.com
#get方式
import urllib
import urllib2

values={}
values['username'] = "xxxxx@qq.com"
values['password']="XXXX"
data = urllib.urlencode(values) 
url = "http://passport.csdn.net/account/login"
geturl = url + "?"+data
request = urllib2.Request(geturl)
response = urllib2.urlopen(request)
print response.read()



#!/usr/bin/python
# -*- coding:utf-8 -*-
# urllib2_test.py
# author: sfzoro
# 2015-03-16 sfzoro@163.com
#post方式
import urllib
import urllib2

values = {"username":"xxxxx@qq.com","password":"XXXX"}
data = urllib.urlencode(values) 
url = "https://passport.csdn.net/account/login?from=http://my.csdn.net/my/mycsdn"
request = urllib2.Request(url,data)
response = urllib2.urlopen(request)
print response.read()

4.1.Python urllib 爬取基金数据

用的是 Python 2.7,爬取用 urllib 就不用再下载其它的包了,不过代码多了点,下次改用 Requests。
主要使用了http://skylark.readthedocs.io/en/latest/ 做ORM了,给个淘宝svn地址,大家自己下代码
http://code.taobao.org/svn/St_Spider/trunk/Stock
如果下载不了,可以给我发邮件。闲话不多说,下面是主代码:

#!/usr/bin/python
# -*- coding:utf-8 -*-
# urllib2_test.py
# author: sfzoro
# 2015-03-16 sfzoro@163.com
#get方式
import re
import urllib
import urllib2
import sqlite3
from collections import OrderedDict
from PraseConfig import ParseConfig
from models import Founds
#from bs4 import BeautifulSoup

def srcapydata():
    #获取配置信息
    configdata = ParseConfig().parse()
    values = OrderedDict()
    values['fundcode'] = configdata.fundcode
    values['startdate'] = configdata.startdate
    values['enddate'] = configdata.enddate
    data = urllib.urlencode(values)  
    geturl = configdata.url + "?"+data

    print geturl

    #设置http请求头
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.154 Safari/537.36 LBBROWSER"
    referer = "http://jingzhi.funds.hexun.com/DataBase/jzzs.aspx?fundcode=163412&startdate=2015-11-11&enddate=2016-04-14"
    accAccept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    headers = { 'User-Agent' : user_agent, 'Referer': referer, 'Accept':accAccept}

    #请求
    request = urllib2.Request(geturl, headers=headers)  
    response = urllib2.urlopen(request)  
    html = response.read()

    #解析
    listData = re.findall( r'<td style="text-align: center;">(.*?)</td>', html, re.M|re.I)
    listBaiFengBi = re.findall( r'<td style="text-align: center;" class="f_.*">(.*?)</td>', html, re.M|re.I)
    listBaiFengBi2 = re.findall( r'<td style="text-align: center;" class="end">(.*?)</td>', html, re.M|re.I)

    #insert 
    timeList = []
    for i in range(len(listData)):
        if(i%2 == 0):
           timeList.append(listData[i])

    valueList = [] 
    for i in range(len(listData)):
        if(i%2 == 1):
           valueList.append(listData[i])


    #创建数据库
    dbPath = "G:\\2016\\coding\\trunk\\Stock\\founds_"
    dbPath += configdata.fundcode
    dbPath += ".db"
    conn = sqlite3.connect(dbPath)
    c = conn.cursor()
    c.execute('''
            CREATE TABLE if not exists "Founds" (
    "id"  INTEGER PRIMARY KEY AUTOINCREMENT,
    "release_date"  TEXT(10) NOT NULL,
    "unit_price"  REAL(10) NOT NULL,
    "accumulative_net_value"  REAL(10),
    "daily_growth_rate"  TEXT(10))
    ''')
    conn.commit()
    conn.close()

    #保持数据
    for i in range(len(timeList)):
        founds = Founds()
        founds.release_date = timeList[i]
        founds.unit_price = valueList[i]
        founds.accumulative_net_value = listBaiFengBi2[i]
        founds.daily_growth_rate = listBaiFengBi[i]
        founds.save()

if __name__ == "__main__":
    srcapydata()


0
0

查看评论
* 以上用户言论只代表其个人观点,不代表CSDN网站的观点或立场
    个人资料
    • 访问:34476次
    • 积分:662
    • 等级:
    • 排名:千里之外
    • 原创:32篇
    • 转载:4篇
    • 译文:0篇
    • 评论:6条
    最新评论