#coding=utf-8
# Small Python 2.7 experiments.  The sections between the separator lines
# appear to have been independent snippets, so names such as ``a`` and ``b``
# are deliberately re-bound from one section to the next.

# --- Boolean operators -------------------------------------------------------
a = 10
b = 20
print(a < b or a == b)
print(a < b and a == b)
print(not True)
##############################################################################
# --- Built-in documentation --------------------------------------------------
print(str.__doc__)
##############################################################################
# --- String concatenation ----------------------------------------------------
a = "tom"
b = "jerry"
print(a + b)
##############################################################################
# --- String predicates -------------------------------------------------------
a = "xYz"
print(a.islower())
##############################################################################
# --- Writing and reading a text file -----------------------------------------
path = "D:\\testpython.txt"
# ``with`` closes the handle even if a write or read raises.
with open(path, "w") as f:
    f.write("First line 1.\n")
    f.write("First line 2.")
with open(path, "r") as f:
    for line in f:
        # ``line`` keeps its own trailing newline, so the output is
        # double-spaced.
        print(line)
##############################################################################
# --- Exception handling ------------------------------------------------------
s = raw_input("Input your age:")
if s == "":
    raise Exception("Input must no be empty.")
try:
    i = int(s)
except ValueError:
    print("Could not convert data to an integer")
except Exception:
    # Narrower than a bare ``except:`` so that KeyboardInterrupt and
    # SystemExit are not swallowed.
    print("Unknown exception")
else:
    # Runs only when the try clause did not raise an exception.
    # Joined with a space to reproduce the two-item print statement exactly.
    print(" ".join(["You are %d" % i, " years old"]))
finally:
    # Clean-up action: runs whether or not an exception occurred.
    print("Goodbye!")
##############################################################################
# --- Classes and inheritance -------------------------------------------------
class Base(object):
    """Minimal container that collects items in ``self.data``."""

    def __init__(self):
        """Start with an empty item list."""
        self.data = []

    def add(self, x):
        """Append ``x`` to ``self.data``."""
        self.data.append(x)

    def addtwice(self, x):
        """Append ``x`` to ``self.data`` two times."""
        self.add(x)
        self.add(x)


class Child(Base):
    """``Base`` extended with a simple addition helper."""

    def plus(self, a, b):
        """Return ``a + b``."""
        return a + b


ochild = Child()
ochild.add("str1")
print(ochild.data)
print(ochild.plus(2, 3))
##############################################################################
import os
import os.path
# os and os.path contain most of the file-access functions, so import them.
import os

# --- Walking a directory tree ------------------------------------------------
rootdir = "d:\\SiLabs"
for parent, dirnames, filenames in os.walk(rootdir):
    for dirname in dirnames:
        print(" parent is: " + parent)
        print(" dirname is: " + dirname)
    for filename in filenames:
        print(" parent is: " + parent)
        print(" filename is: " + filename)
# Notes:
# * os.walk yields 3-tuples: ``dirnames`` holds the directory names (without
#   path), ``filenames`` holds the file names (without path) and ``parent``
#   is the directory that contains them.
# * The first inner loop shows how to visit every directory.
# * The second inner loop shows how to visit every file.
# * os.path.join(dirname, filename) turns "/a/b/c" and "d.java" into
#   "/a/b/c/d.java".
##############################################################################
# --- Splitting paths ---------------------------------------------------------
import os.path
# Three common operations: split directory from file name, find the drive
# letter (Windows) and find the file extension.
# Raw string: same value as before, but the backslashes are no longer read as
# (invalid) escape sequences.
spath = r"D:\Desktop\Cannon\stm401.rar"
# case 1: directory / file name
p, f = os.path.split(spath)
print(" dir is: " + p)
print(" file is: " + f)
# case 2: drive / remainder
drv, left = os.path.splitdrive(spath)
print(" driver is: " + drv)
print(" left is: " + left)
# case 3: path without extension / extension
f, ext = os.path.splitext(spath)
print(" f is: " + f)
print(" ext is: " + ext)
# Notes: all three functions return a 2-tuple.
# * case 1 separates the directory from the file name
# * case 2 separates the drive from the rest of the path
# * case 3 separates the file from its extension
##############################################################################
# --- Copying files and directory trees with shutil ---------------------------
import os.path
import shutil
src = "D:/test/myfile1.txt"
dst = "D:/test/myfile2.txt"
dst2 = "D:\\test\\myfile3.txt"
dst3 = "D:/test/测试文件夹.txt"
uipath = unicode(dst3, "utf8")
dir1 = os.path.dirname(src)
print("dir1 %s" % dir1)
# Test the directory, not the file: os.makedirs raises if the directory
# already exists, which happened whenever only the file was missing.
if not os.path.isdir(dir1):
    os.makedirs(dir1)
with open(src, "w") as f1:
    f1.write("line a\n")
    f1.write("line b\n")
shutil.copyfile(src, dst)
shutil.copyfile(src, dst2)
shutil.copyfile(src, uipath)
with open(dst, 'r') as f2:
    for line in f2:
        print(line)
# Try copying a whole directory tree.
try:
    srcDir = "D:/test"
    dstDir = "D:/test2"
    # shutil.copytree raises if dstDir already exists, which also means a
    # bare drive such as d: cannot be used as the destination.
    shutil.copytree(srcDir, dstDir)
except Exception as err:
    print(err)
# Notes:
# * shutil.copyfile: how to copy a file
# * os.path.isdir / os.path.exists: how to test whether a path exists
# * shutil.copytree: how to copy a directory tree
##############################################################################
# --- Dated backup of a directory ---------------------------------------------
import os.path
import shutil
import datetime


def BackUpDD():
    """Copy D:\\test into D:\\temp\\<YYYYMMDD> and report the elapsed time.

    A failure of the copy (for example because today's backup folder already
    exists) is printed rather than raised.
    """
    # Add the directories you want to copy.
    backdir = "D:\\temp"
    copydirs = []
    copydirs.append("D:\\test")
    print(" Copying files =================== ")
    # Generate a date-named folder for the backup.
    start = datetime.datetime.now()
    backdir = os.path.join(backdir, start.strftime("%Y%m%d"))
    print(copydirs[0])
    print(backdir)
    try:
        shutil.copytree(copydirs[0], backdir)
    except Exception as err:
        print(err)
    end = datetime.datetime.now()
    print("Finished! ===================")
    # total_seconds() includes whole days, which ``.seconds`` silently drops.
    elapsed = int((end - start).total_seconds())
    print("Elapsed time : " + str(elapsed) + "seconds ")


def omitPrefix(fullpath, prefix):
    """Print and return ``fullpath`` with ``prefix`` and one separator removed.

    ``prefix`` must be given WITHOUT a trailing separator, because one extra
    character after the prefix is skipped.  Example:
    omitPrefix('D:\\\\temp\\\\123.txt', 'D:\\\\temp') returns '123.txt'.
    The prefix is not checked; the first len(prefix) + 1 characters are
    simply cut off.
    """
    remainder = fullpath[len(prefix) + 1:]
    print(remainder)
    return remainder


BackUpDD()  # copytree copies the file metadata as well
##############################################################################
# --- Miscellaneous os helpers ------------------------------------------------
import os
copydirs = []
copydirs.append("D:\\test")
# Is it a regular file?
print(os.path.isfile(copydirs[0]))
# Is it an absolute path?
print(os.path.isabs(copydirs[0]))
print(os.listdir(copydirs[0]))
# Run a shell command (alternatives: ping www.pythontab.com, ipconfig).
#os.system('ping www.pythontab.com')
print(os.system('msconfig'))
##############################################################################
# --- urllib2 error handling --------------------------------------------------
import urllib2
req = urllib2.Request('http://bbs.csdn.net/callmewhy')
# HTTPError is a subclass of URLError, so it has to be caught first.
try:
    urllib2.urlopen(req)
except urllib2.HTTPError as e:
    print('The server couldn\'t fulfill the request.')
    print(' '.join(['Error code: ', str(e.code)]))
except urllib2.URLError as e:
    print('We failed to reach a server.')
    print(' '.join(['Reason: ', str(e.reason)]))
##############################################################################
# --- Requesting a page with POST and with GET --------------------------------
import urllib   # urlencode lives in urllib, not urllib2
import urllib2
url = 'http://www.zhihu.com'
values = {}
values['username'] = "2826098981@qq.com"
values['password'] = "XXXXX"
data = urllib.urlencode(values)
# POST: the encoded data is passed as the request body (built for
# illustration only; it is replaced by the GET request below).
request = urllib2.Request(url, data)
# GET: the encoded data is appended to the URL as a query string.
geturl = url + "?" + data
request = urllib2.Request(geturl)
response = urllib2.urlopen(request)
print('url:' + url)
print(response.info())
##############################################################################
# --- Anti-hotlinking: send a Referer header ----------------------------------
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Referer': 'http://www.zhihu.com/articles',
}
##############################################################################
# --- Downloading pages of a Baidu Tieba thread -------------------------------
import string
import urllib
import urllib2


def baidu_tieba(url, begin_page, end_page):
    """Save pages ``begin_page``..``end_page`` (inclusive) of a thread.

    ``url`` is the thread address up to and including ``pn=``; the page
    number is appended to it.  Each page is written to the current directory
    as a five-digit, zero-padded file name such as ``00001.html``.
    """
    for i in range(begin_page, end_page + 1):
        sName = str(i).zfill(5) + '.html'  # zero-padded to five digits
        print('正在下载第' + str(i) + '个网页,并将其存储为' + sName + '......')
        # Download first so that a failed request leaves neither an empty
        # file nor an open handle behind.
        page = urllib2.urlopen(url + str(i))
        try:
            m = page.read()
        finally:
            page.close()
        with open(sName, 'w+') as f:
            f.write(m)


# -------- parameters ------------------
# Example: a thread in the Shandong University Tieba forum
# bdurl = 'http://tieba.baidu.com/p/2296017831?pn='
# iPostBegin = 1
# iPostEnd = 10
#bdurl = str(raw_input(u'请输入贴吧的地址,去掉pn=后面的数字:\n'))
#begin_page = int(raw_input(u'请输入开始的页数:\n'))
#end_page = int(raw_input(u'请输入终点的页数:\n'))
bdurl = 'https://tieba.baidu.com/p/297554321?pn='
begin_page = 1
end_page = 5
# -------- parameters ------------------
baidu_tieba(bdurl, begin_page, end_page)
##############################################################################
# --- urllib2 debug log -------------------------------------------------------
# Turning on the debug level prints the content of sent and received packets,
# which helps debugging; it is not needed very often.
import urllib2
httpHandler = urllib2.HTTPHandler(debuglevel=1)
httpsHandler = urllib2.HTTPSHandler(debuglevel=1)
opener = urllib2.build_opener(httpHandler, httpsHandler)
urllib2.install_opener(opener)
response = urllib2.urlopen('http://www.baidu.com')
##############################################################################
# --- Qiushibaike crawler -----------------------------------------------------
import urllib
import urllib2
import re
import thread
import time


class spiderModel(object):
    """Console reader that pre-fetches Qiushibaike "hot" pages in a thread.

    Attributes:
        page:   number of the next page the loader will fetch.
        pages:  queue of fetched pages, each a list of
                [attribute-remainder, content] pairs.
        enable: the loader thread and the display loop run while this is True.
    """

    def __init__(self):
        self.page = 1
        self.pages = []
        self.enable = False

    def GetPage(self, page):
        """Fetch hot page ``page`` (a string) and return its content items.

        Returns a list of two-element lists, one per
        ``<div ... class="content...>`` element found: the text between
        ``class="content`` and the closing ``>`` of the tag, and the text up
        to the next ``</div>``, both with newlines removed.
        """
        myUrl = "http://m.qiushibaike.com/hot/page/" + page
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib2.Request(myUrl, headers=headers)
        myResponse = urllib2.urlopen(req)
        try:
            myPage = myResponse.read()
        finally:
            myResponse.close()
        # encode turns unicode into a byte string of some encoding;
        # decode turns a byte string of some encoding into unicode.
        unicodePage = myPage.decode("utf-8")
        # Find every div whose class starts with "content".
        # re.S lets "." match newlines as well.
        myItems = re.findall('<div.*?class="content(.*?)>(.*?)</div>',
                             unicodePage, re.S)
        # item[0] is the rest of the opening tag, item[1] is the div content.
        return [[item[0].replace("\n", ""), item[1].replace("\n", "")]
                for item in myItems]

    def loadPage(self):
        """Background loop: keep up to two pages buffered in ``self.pages``."""
        # Runs until the user enters quit.
        while self.enable:
            if len(self.pages) < 2:
                try:
                    # Fetch the jokes of the next page.
                    myPage = self.GetPage(str(self.page))
                    self.page += 1
                    self.pages.append(myPage)
                except Exception:
                    print('无法链接糗事百科!')
                    # Back off instead of retrying in a tight loop.
                    time.sleep(1)
            else:
                time.sleep(1)

    def ShowPage(self, nowPage, page):
        """Print the items of ``nowPage`` one by one, waiting for Enter.

        Entering ``quit`` clears ``self.enable`` and stops the display.
        """
        sep = '|||'
        for items in nowPage:
            # Only show divs whose class attribute is exactly "content",
            # i.e. where nothing but the closing quote was captured.
            if items[0] != '"':
                continue
            print(u' '.join([u'第%d页' % page, sep, items[0], sep, items[1]]))
            myInput = raw_input()
            if myInput == "quit":
                self.enable = False
                break

    def Start(self):
        """Start the loader thread and display pages until the user quits."""
        self.enable = True
        page = self.page
        print(u'正在加载中请稍候......')
        # Load and store the jokes in a background thread.
        thread.start_new_thread(self.loadPage, ())
        while self.enable:
            if self.pages:
                nowPage = self.pages.pop(0)
                self.ShowPage(nowPage, page)
                page += 1
            else:
                # Nothing buffered yet: yield instead of spinning at full CPU.
                time.sleep(0.1)


# ----------- program entry point -----------
print(u"""
---------------------------------------
程序:糗百爬虫
版本:0.4
作者:charles
日期:2017-03-17
语言:Python 2.7
操作:输入quit退出阅读糗事百科
功能:按下回车依次浏览今日的糗百热点
---------------------------------------
""")
print(u'请按下回车浏览今日的糗百内容:')
raw_input(' ')
myModel = spiderModel()
myModel.Start()
##############################################################################
# --- Downloading the pictures of a Baidu Tieba thread ------------------------
import re
import urllib


def getHtml(url):
    """Return the raw body of ``url``."""
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()


def getImg(html):
    """Download every .jpg referenced in ``html`` and return the URL list.

    The pictures are saved in the current directory as 0.jpg, 1.jpg, ...
    using urllib.urlretrieve, which stores remote data directly in a file.
    """
    imgre = re.compile(r'src="(.+?\.jpg)" pic_ext')
    imglist = re.findall(imgre, html)
    for x, imgurl in enumerate(imglist):
        urllib.urlretrieve(imgurl, '%s.jpg' % x)
    return imglist


html = getHtml("http://tieba.com/p/2460150866")  # other thread id: 4877448324
print(getImg(html))
# How the pattern r'src="(.+?\.jpg)" pic_ext' works:
#   src="        matches the literal text src="
#   (.+?\.jpg)   the parentheses capture a group;
#                .+ matches at least one arbitrary character and the ? makes
#                it lazy, i.e. it matches as few characters as possible;
#                together, .+?\.jpg matches as little as possible up to .jpg,
#                so the match does not run past the src attribute;
#                the group therefore captures the picture URL
#   " pic_ext    matches the literal text " pic_ext
#print re.escape('<div><class="content"><span>A</span></div>')
#\<div\>\<class\=\"content\"\>\<span\>(.*?)(.*?)\<\/span\>\<\/div\>
##############################################################################
# --- Weather site: one week of weather conditions for Nanjing (first try) ----
import re
import urllib


def getHtml(url):
    """Return the raw body of ``url``."""
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()


def getStat(html):
    """Return the text of every ``class="wea"`` paragraph in ``html``."""
    imgre = re.compile(r'class="wea">(.*?)</p>')
    return re.findall(imgre, html)


html = getHtml("http://www.weather.com.cn/weather/101190101.shtml")
weatherlist = getStat(html)
for weather in weatherlist:
    print(weather.decode('utf-8'))
##############################################################################
# --- Weather site: one week of temperatures for Nanjing ----------------------
import re
import urllib


def getHtml(url):
    """Return the raw body of ``url``."""
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()


def getStat(html):
    """Return (high, low) pairs matched by ``<span>..</span>/<i>..</i>``."""
    reg = r'<span>(.*?)</span>/<i>(.*?)</i>'
    #reg = r'class="wea">(.*?)</p>'
    imgre = re.compile(reg)
    return re.findall(imgre, html)


html = getHtml("http://www.weather.com.cn/weather/101190101.shtml")
weatherlist = getStat(html)
for weather in weatherlist:
    print('今日最高温度:')
    print(weather[0].decode('utf-8'))
    print('今日最低温度:')
    print(weather[1].decode('utf-8'))
##############################################################################
# Python 小实验代码的备份,主要偏向于爬虫方向,难点是正则表达式和编码转换(入门)
# 最新推荐文章于 2024-08-01 11:08:18 发布