Python: assorted syntax notes

Installing Python Scrapy on Ubuntu

1. Ubuntu ships with Python preinstalled

2. Install Scrapy

First install pip:
apt-get install python-pip

Then install Scrapy:
pip install scrapy
If the default PyPI index is unreachable, point pip at one of the mirrors below:
http://pypi.douban.com/simple/
http://pypi.v2ex.com/simple/

sudo pip install -i http://pypi.douban.com/simple/ scrapy
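
To avoid passing -i on every install, the mirror can be set as pip's default index in its config file (a sketch; ~/.pip/pip.conf with a [global] index-url key is standard pip configuration):

# ~/.pip/pip.conf
[global]
index-url = http://pypi.douban.com/simple/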




Installing easy_install on Win7 64-bit
Run the script below:
http://peak.telecommunity.com/dist/ez_setup.py


This downloads easy_install.exe into C:\Python27\Scripts.
Add C:\Python27\Scripts to the PATH environment variable.

Install pip:
easy_install pip
This places pip.exe in C:\Python27\Scripts.

That said, this version of the Python installer already sets both of these up automatically.


Install Scrapy:
pip install scrapy

Scrapy is installed into C:\Python27\Lib\site-packages.


References:
http://blog.csdn.net/dreamzml/article/details/8847879
http://jingyan.baidu.com/article/e73e26c0d94e0524adb6a7ff.html

1.

https://www.python.org/downloads/release/python-343/
https://www.python.org/download/releases/2.7.7/

python 3.4

import sys
sys.path.append('c:/')  # add c:/ to the module search path

import hello  # imports hello.py; hello.py contains a single line: x=1.2
print(hello.x)

>>> dir(hello)
['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'x']

>>> print(hello.__file__)
C:\Python34\hello.py

>>> print(hello.__name__)
hello
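
If hello.py is edited within the same interpreter session, a second import is a no-op because the module is cached; Python 2's built-in reload re-executes it:

>>> reload(hello)  # re-runs c:/hello.py and refreshes hello.x
<module 'hello' from 'C:\Python34\hello.py'>
>>> print(hello.x)
1.2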

When Python 2 prints a list or dict directly, Chinese strings show up as UTF-8 escape bytes instead of characters, as in this example:
# -*- coding: utf-8 -*-
list1=['你好abc','那你abc']
print list1
print list1[0]

dic1={'egg':'你好abc','cc':'那你abc'}
print dic1
print dic1['egg']

['\xe4\xbd\xa0\xe5\xa5\xbdabc', '\xe9\x82\xa3\xe4\xbd\xa0abc']
你好abc
{'cc': '\xe9\x82\xa3\xe4\xbd\xa0abc', 'egg': '\xe4\xbd\xa0\xe5\xa5\xbdabc'}
你好abc
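
Printing the container shows each element's repr(); printing the elements themselves writes the raw UTF-8 bytes, which the terminal renders as Chinese. A one-line sketch using the same list1:

print ', '.join(list1)
# 你好abc, 那你abc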


2.
# fetch a page with urllib
import urllib
content = urllib.urlopen('http://www.baidu.com').read()
print(content)

# the same with urllib2
import urllib2
content = urllib2.urlopen('http://www.baidu.com').read()
print(content)

# going through a Request object
import urllib2
req = urllib2.Request('http://www.baidu.com')
content = urllib2.urlopen(req).read()
print content

# adding a header to the request
import urllib2
req = urllib2.Request('http://www.baidu.com/')
req.add_header('User-Agent', 'fake-client')
res = urllib2.urlopen(req)
print res.read()

# POSTing form data with custom headers
import urllib
import urllib2
url = 'http://www.baidu.com'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }

values = {'name' : 'WHY',
          'location' : 'SDU',
          'language' : 'Python' }
data = urllib.urlencode(values)

req = urllib2.Request(url, data, headers)
res = urllib2.urlopen(req)
print res.headers.items()

# inspecting the response headers
import urllib2
req = urllib2.Request('http://www.baidu.com')
res = urllib2.urlopen(req)
item = res.headers.items()
print item
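
All of the snippets above assume the request succeeds; urllib2 raises HTTPError or URLError on failure, and both are worth catching (a minimal sketch using only the standard urllib2 API):

import urllib2
try:
    res = urllib2.urlopen('http://www.baidu.com', timeout=10)
    print res.getcode()  # HTTP status code
except urllib2.HTTPError as e:  # must come first: HTTPError subclasses URLError
    print 'HTTP error:', e.code
except urllib2.URLError as e:
    print 'failed to reach the server:', e.reason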

Regular expressions
# -*- coding: utf-8 -*-
import re
match = re.search(r'(hello) (world)', 'hello world!')
print match.group(0)  # 'hello world' -- the entire match
print match.group(1)  # 'hello' -- first capture group
print match.group(2)  # 'world' -- second capture group


Greedy mode: a.*b
. matches any character except a newline; * means zero or more of the preceding element (here the dot, so the repeated characters need not all be the same). Combined, a.*b matches the longest substring that starts with a and ends with b.

import re
x='aak7bka7kb'
y=re.findall('a.*b',x)
print y
['aak7bka7kb']


Lazy (non-greedy) mode: a.*?b
Matches the shortest substring that starts with a and ends with b.
Adding ? after .* still permits any number of repetitions, but uses as few as possible while still letting the overall match succeed.
__author__ = 'songl'
import re
x='aak7bka7kb'
y=re.findall('a.*?b',x)
print y

['aak7b', 'a7kb']
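
To see exactly where each non-greedy match lands, re.finditer exposes the match spans (a small sketch on the same string):

import re
x = 'aak7bka7kb'
for m in re.finditer('a.*?b', x):
    print m.start(), m.end(), m.group(0)
# 0 5 aak7b
# 6 10 a7kb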

http://deerchao.net/tutorials/regex/regex.htm

Using the third-party requests library in place of urllib and urllib2

#-*-coding:utf8-*-
import requests
import re
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

hea = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
html = requests.get('http://news.fx678.com/news/top/index.shtml',headers = hea)
html.encoding = 'utf-8' # force utf-8 decoding, otherwise Chinese text prints as mojibake
chinese = re.findall(r'class="touzi_font"(.*?)</div>',html.text,re.S)
for each in chinese:
    hhh =  re.findall(r'target=\"_blank\">(.*?)</a></h1>',each,re.S)
    print hhh[0]

Fetching every news headline from huitong (fx678):
first match out the full content of each <div class="touzi_font"> tag,
then process the smaller tags inside each div.
#-*-coding:utf8-*-
import requests
import re
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

def findweb(url):
    hea = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
    html = requests.get(url,headers = hea)
    html.encoding = 'utf-8' # force utf-8 decoding, otherwise Chinese text prints as mojibake
    chinese = re.findall('class="touzi_font"(.*?)</div>',html.text,re.S)
    for each in chinese:
        hhh =  re.findall('target="_blank">(.*?)</a></h1>',each,re.S)
        print hhh[0]

url ='http://news.fx678.com/news/top/index.shtml'
findweb(url)

for i in range(1,11):
    new_url = re.sub('index','index%d'%i,url)  # note: re.sub's 4th positional argument is count, not flags
    findweb(new_url)

A sample of the page HTML that the regex above matches:

<div class="touzi_font">
    <h1><a href="/C/20151017/201510171648121936.shtml" onclick="add_click('201510171648121936')" target="_blank">周评:核心CPI助美元重夺失地,下周关注中国GDP及欧银...</a></h1>
    <p><a href="/C/20151017/201510171648121936.shtml" onclick="add_click('201510171648121936')" target="_blank">美元指数本周下跌0.16%,收报94.73,一度触及七周低位93.80,因因美联储或进一步推迟升息忧虑加重,而本周稍晚公布的美国核心CPI上升帮助美元挽回了大部分跌势。</a></p>
</div>

<div class="clear"></div>
</li>
<li>
    <div class="new_6_rt"><span>文/</span><a href="/news/Editor/193/index.shtml">飞鱼</a></div>
    <div class="clock_touzi">2015年10月17日 16:16:41 </div>
    <div class="clear"></div>
    <div class="new_6_pic">
        <a href="/C/20151017/201510171616241935.shtml" onclick="add_click('201510171616241935')" target="_blank"><img src=http://upload.fx678.com/upload/ht/20151017/sl_2015101716135261.jpg width="145px" height="110px" /></a>
    </div>

    <div class="touzi_font">
        <h1><a href="/C/20151017/201510171616241935.shtml" onclick="add_click('201510171616241935')" target="_blank">周评:推迟升息忧虑发酵,黄金涨近2%触及四个月高位</a></h1>
        <p><a href="/C/20151017/201510171616241935.shtml" onclick="add_click('201510171616241935')" target="_blank">现货黄金本周(10月12日至10月16日)大涨1.81%,收报1176.97美元/盎司,周四(10月15日)一度触及四个月高位1191.48美元/盎司,因美联储或进一步推迟升息的忧虑大大鼓舞了黄金多头。不过本周稍晚公布的美国核心CPI上升令黄金回吐了...</a></p>
    </div>

    <div class="clear"></div>
</li>
<li>
    <div class="new_6_rt"><span>文/</span><a href="/news/Editor/185/index.shtml">龙舞</a></div>
    <div class="clock_touzi">2015年10月17日 07:28:14 </div>
    <div class="clear"></div>
    <div class="new_6_pic">
        <a href="/C/20151017/201510170728141851.shtml" onclick="add_click('201510170728141851')" target="_blank"><img src=http://upload.fx678.com/upload/ht/20151017/sl_2015101707194377.jpg width="145px" height="110px" /></a>
    </div>

    <div class="touzi_font">
        <h1><a href="/C/20151017/201510170728141851.shtml" onclick="add_click('201510170728141851')" target="_blank">美元油价宁死不屈,下周OPEC再掀腥风血雨</a></h1>
        <p><a href="/C/20151017/201510170728141851.shtml" onclick="add_click('201510170728141851')" target="_blank">10月16日美国经济数据好坏参半,美元仍然在升息预期和风险偏好的助推下小升,在近来疲软数据和中国放缓忧虑的施压下连续两日反弹。原油价格也录得良好表现,空头回补和钻井平台数据助推油价上涨2%。下周原油市场的焦点将转向OP...</a></p>
    </div>

Fetching the Jikexueyuan course list
#-*-coding:utf8-*-
import requests
import re
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

def findweb(url):
    hea = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
    html = requests.get(url,headers = hea)
    html.encoding = 'utf-8' # force utf-8 decoding, otherwise Chinese text prints as mojibake
    chinese = re.findall('(<li id="\d*" test="0" deg="0" >.*?</li>)',html.text,re.S)

    for each in chinese:
        # print each
        learn =  re.search('<em class="learn-number">(.*?)</em>',each,re.S).group(1)
        jibie =  re.search('<i class="xinhao-icon\d*"></i><em>(.*?)</em>',each,re.S).group(1)
        keshi =  re.search('<dd class="mar-b8"><i class="time-icon"></i><em>(.*?)</em>',each,re.S).group(1)
        title =  re.search('class="lessonimg" title="(.*?)>',each,re.S).group(1)
        print title
        print learn
        print jibie
        print keshi
        print '\n'

url ='http://www.jikexueyuan.com/course/?pageNum=1'
findweb(url)




Fetching a UTF-8-encoded site with requests (IDE: PyCharm)

1. Explicitly set the response encoding to utf-8:

#-*-coding:utf8-*-
import requests
import re
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

hea = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
html = requests.get('http://www.baidu.com',headers = hea)
html.encoding = 'utf-8'
print html.text
2. In the bottom-right corner of the .py editor pane, select utf-8.


3. Press Ctrl+Alt+S to open Settings, go to Editor -> File Encodings,
and set Project Encoding to utf-8.




Fetching a GB2312/GBK-encoded site with requests
1. Use the response's content (raw bytes) instead of text:
import requests
import re

hea = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
html = requests.get('http://cl.loei.pw/thread0806.php?fid=21',headers = hea)
print html.content
2. Press Ctrl+Alt+S to open Settings, go to Editor -> File Encodings,
and set Project Encoding to GBK.
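
Alternatively, decode the raw bytes yourself, or tell requests the encoding before reading text (a sketch; the URL here is a hypothetical GBK-encoded page):

import requests

html = requests.get('http://example.com/gbk-page')  # hypothetical GBK page
text = html.content.decode('gbk', 'ignore')  # decode the raw bytes manually
# or equivalently: html.encoding = 'gbk'; text = html.text
print text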

Using XPath
//            select from anywhere in the document
/             step down one level
/text()       extract the text content
/@xxx         extract the value of attribute xxx
[@id="xx"]    filter: keep only elements whose id is xx
[@class="xx"] filter: keep only elements whose class is xx
[@yy="xx"]    filter: keep only elements whose attribute yy equals xx
A few examples:


text of every li under a ul under a div under any element:
content = selector.xpath('//*/div/ul/li/text()')
text of every element:
content = selector.xpath('//*/text()')
text of every li under the element whose id is "useful":
content = selector.xpath('//*[@id="useful"]/li/text()')
text of every li element:
content = selector.xpath('//li/text()')
the following three are equivalent:
content = selector.xpath('//*/body/text()')
content = selector.xpath('//body/text()')
content = selector.xpath('/html/body/text()')
A path that starts with a single / must begin at the root node; otherwise use //.

Sometimes the XPath produced by the 360 browser's right-click Copy XPath is not quite what selector.xpath needs: a div's numeric index, for instance, may differ because the page is generated dynamically.
When an expression fails to match, debug by walking the path from the root and checking which node index is off.

content = selector.xpath('/html/body/*')  # all child nodes of body
print(content)
content = selector.xpath('/html/body/@*')  # all attributes of body
print(content)


content = selector.xpath('/html/body/div[1]/*')  # all child nodes of div[1]
print(content)
content = selector.xpath('/html/body/div[1]/@*')  # all attributes of div[1]
print(content)

content = selector.xpath('/html/body/div[4]/div[1]/div[2]/div[5]/table/tbody/tr[6]/td/div[1]')
print(content)
for each in content:
    innerhtml = etree.tostring(each)  # serialize the div node's full HTML to a string
    print innerhtml

content = selector.xpath('/html/body/div[4]/div[1]/div[2]/div[5]/table/tbody/tr[6]/td/div[1]/text()')
print(content)
for each in content:
    innerhtml = each  # the div node's own text (excluding child nodes)
    print innerhtml

content = selector.xpath('/html/body/div[4]/div[1]/div[2]/div[5]/table/tbody/tr[6]/td/div[1]')
for each in content:
    siteurl = each.xpath('string(.)')  # all text under the div, including every descendant node
    print siteurl



Installing lxml
pip install lxml

#-*-coding:utf8-*-
from lxml import etree
html = '''
<!DOCTYPE html>
<html>
<head lang="en">
    <meta charset="UTF-8">
    <title>测试-常规用法</title>
</head>
<body>
<div id="content">
    <ul id="useful">
        <li>这是第一条信息</li>
        <li>这是第二条信息</li>
        <li>这是第三条信息</li>
    </ul>
    <ul id="useless">
        <li>不需要的信息1</li>
        <li>不需要的信息2</li>
        <li>不需要的信息3</li>
    </ul>

    <div id="url">
        <a href="http://jikexueyuan.com">极客学院</a>
        <a href="http://jikexueyuan.com/course/" title="极客学院课程库">点我打开课程库</a>
    </div>
</div>

</body>
</html>
'''

selector = etree.HTML(html)

# extract the text
content = selector.xpath('//ul[@id="useful"]/li/text()')
for each in content:
    print each

# extract attributes
link = selector.xpath('//a/@href')
for each in link:
    print each

title = selector.xpath('//a/@title')
print title[0]
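
XPath predicates can also use functions; for example contains() matches on a substring of an attribute. A small sketch against the same selector (the "课程" keyword comes from the title attribute in the example HTML above):

# href of every <a> whose title contains "课程"
link = selector.xpath(u'//a[contains(@title, "课程")]/@href')
for each in link:
    print each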


使用xpath获取汇通网新闻首页新闻标题

#-*-coding:utf8-*-
import requests
from lxml import etree
import re
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

def findweb(url):
    hea = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
    html = requests.get(url,headers = hea)
    html.encoding = 'utf-8' # force utf-8 decoding, otherwise Chinese text prints as mojibake
    selector = etree.HTML(html.text)
    # extract the text
    content = selector.xpath('//*[@id="analysis_ul"]/li/div[5]/h1/a/text()')
    for each in content:
        print each

url ='http://news.fx678.com/news/top/index.shtml'
findweb(url)

Following the next-page link to fetch every fx678 news headline

#-*-coding:utf8-*-
import requests
from lxml import etree
import re
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

def findweb(url):
    hea = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
    html = requests.get(url,headers = hea)
    html.encoding = 'utf-8' # force utf-8 decoding, otherwise Chinese text prints as mojibake
    selector = etree.HTML(html.text)
    # extract the text
    content = selector.xpath('//*[@id="analysis_ul"]/li/div[5]/h1/a/text()')
    for each in content:
        print each

    nextlink = selector.xpath('//div[@class="hc_content"]/div[@class="analysis_inter_left"]/div[@class="hc_new_6"]/div[@class="my_pg"]/a[@class="nxt"]/@href')
    print nextlink
    if nextlink:
        nextlink = nextlink[0]
        print nextlink
        url = 'http://news.fx678.com/news/top/' + nextlink
        findweb(url)

url ='http://news.fx678.com/news/top/index.shtml'
findweb(url)
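
The recursive call works, but a long chain of pages can hit Python's recursion limit; the same crawl as a plain loop (a sketch under the same site-layout assumptions, with the next-link XPath shortened to the my_pg pager seen above):

#-*-coding:utf8-*-
import requests
from lxml import etree

hea = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64)'}
url = 'http://news.fx678.com/news/top/index.shtml'
while url:
    html = requests.get(url, headers = hea)
    html.encoding = 'utf-8'
    selector = etree.HTML(html.text)
    for each in selector.xpath('//*[@id="analysis_ul"]/li/div[5]/h1/a/text()'):
        print each
    nextlink = selector.xpath('//div[@class="my_pg"]/a[@class="nxt"]/@href')
    url = 'http://news.fx678.com/news/top/' + nextlink[0] if nextlink else None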



Downloading images
#-*-coding:utf8-*-
import re
import requests
pic_url =['http://pic.meizitu.com/wp-content/uploads/2015a/10/27/01.jpg',
          'http://pic.meizitu.com/wp-content/uploads/2015a/08/11/01.jpg']
i = 0
for each in pic_url:
    print 'now downloading:' + each
    pic = requests.get(each)
    fp = open('pic\\' + str(i) + '.jpg','wb')
    fp.write(pic.content)
    fp.close()
    i += 1
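
The open() call above assumes a pic directory already exists next to the script; two lines to create it first:

import os
if not os.path.isdir('pic'):
    os.makedirs('pic')  # create the output directory if it is missing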

Python and MySQL
apt-get install python-mysqldb
# -*- coding: utf-8 -*-
from MySQLdb import *
def conn():
    cn=Connection('192.168.1.100','root','','meizi1688')
    #Connection()'s positional parameters, in order:
    #     host (string, host to connect to);
    #     user (string, user to connect as);
    #     passwd (string, password to use);
    #     db (string, database to use)
    #the database can also be selected afterwards:
    #cn.select_db('test')
    cur=cn.cursor()
    cur.execute('select * from mz_url')
    #reposition the cursor; defaults to 0
    #cur.scroll(0)
    row=cur.fetchall()
    #fetch all remaining rows; returns a tuple of tuples
    print row

    #insert
    sql = "insert into mz_url(name,url) values(%s,%s)"
    param = ("aaa",'bb')
    n = cur.execute(sql,param)
    print 'insert',n
    #commit
    cn.commit()
    #close
    cn.close()


if __name__=='__main__':
    conn()
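
If anything in conn() raises after the connection opens, the connection leaks; a minimal try/finally variant (same hypothetical host and credentials as above):

from MySQLdb import Connection

def conn_safe():
    cn = Connection('192.168.1.100','root','','meizi1688')
    try:
        cur = cn.cursor()
        cur.execute('select * from mz_url')
        for row in cur.fetchall():
            print row
        cn.commit()
    finally:
        cn.close()  # always released, even if a query fails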

