python实例

Ayanha

已于 2024-01-09 14:35:14 修改

阅读量178

点赞数

分类专栏： python 文章标签： python

于 2020-02-20 10:28:30 首次发布

本文链接：https://blog.csdn.net/weixin_45052608/article/details/104397095

版权

python 专栏收录该内容

9 篇文章 0 订阅

订阅专栏

实例1：获取文件内容

# Open the file once and run every pass inside the same context manager so the
# handle is always closed. The path is a raw string (r'...') so the backslashes
# of the Windows path are kept literally.
with open(r'C:\Users\用户\Desktop\文件名.txt',mode='r') as a:

    # Method 1: count the lines starting with p/P, plus their words and characters
    count=0
    word_sum=0
    str_sum=0
    for i in a:
        i=i.strip()# remove whitespace on both ends of the line
        if i.startswith(('p','P')):
            print(i)# a line starting with p
            count=count+1
            word_sum=word_sum+len(i.split())
            str_sum=str_sum+len(i)
    print('row number:',count)# number of lines starting with p
    print('word number:',word_sum)# number of whitespace-separated words on those lines
    print('str number:',str_sum)# number of characters on those lines

    # Method 2: same line/word count, written with an early `continue`
    a.seek(0)# the previous loop consumed the file iterator; rewind before re-reading
    count=0
    word_sum=0
    for i in a:
        i=i.strip()
        if not i.startswith(('p','P')):# accept both cases, like method 1
            continue
        print(i)
        count=count+1
        word_sum=word_sum+len(i.split())
    print('row number:',count)
    print('word number:',word_sum)

    # Statistics over the whole file content
    a.seek(0)# rewind again, otherwise read() returns an empty string
    b=a.read()
    print(b)# the full content
    print(len(b))# number of characters
    print(len(b.split()))# number of whitespace-separated words
    print(b.split()[:4])# the first four words of the file

实例2：获取发件人

# Print the sender of every line of email.txt that starts with "f"
# (sample line: from 123@utc.cn bai 5 09:14:16 2019)
with open(r'c:\users\用户名\desktop\email.txt',mode='r') as a:
    for i in a:
        i=i.rstrip()
        if not i.startswith('f'):
            continue# skip lines that do not start with f and move on to the next line
        b=i.split()
        if len(b)<2:
            continue# an "f" line without a second field has no sender to print
        print(b[1])# field 0 is "from", field 1 is the sender address

实例3：word_count

# Read the whole file and build a dict mapping each word to its frequency.
# The `with` block closes the file as soon as the counting is done.
with open(r'C:\Users\xyy\Desktop\test.txt',mode='r') as test:
    word=dict()
    for i in test.read().split():
        word[i]=word.get(i,0)+1
print('单词统计：\n',word)
print('单词统计项：\n',list(word.items()))
print('单词列表：\n',list(word.keys()))
print('单词数目列表：\n',list(word.values()))

# Read the whole file and build a dict mapping each character to its frequency.
# The dict is named char_count rather than `str` so the builtin is not shadowed.
with open(r'C:\Users\xyy\Desktop\test.txt',mode='r') as test:
    text=test.read()
char_count=dict()
for i in text:
    # get(..., 0) + 1 counts a first occurrence as 1 (initialising to 1 and
    # then incrementing would report every character one too high)
    char_count[i]=char_count.get(i,0)+1
print('字符统计：\n',char_count)

# Find the word(s) with the highest frequency
with open(r'C:\Users\xyy\Desktop\test.txt',mode='r') as test:
    word=dict()
    for i in test.read().split():
        word[i]=word.get(i,0)+1
print('单词统计：\n',word)
words=None
word_count=0
# Compute the maximum once instead of on every iteration; default=0 keeps an
# empty file from raising ValueError.
top_count=max(word.values(),default=0)
print('频次最大的单词：')
for i,j in word.items():
    if j>=top_count:
        words=i
        word_count=j
        print(words,word_count)# every word tied for the maximum is printed
print('频次最大的最后一个单词：',words,word_count)

实例4：爬虫

requests库

#脚本-含解析（网页示例）
#url含参
import requests
from lxml.html import fromstring
def getdatabyrequests(url):
    """Fetch *url* and print the text of every ``<span class="bjh-p">`` node.

    Prints the HTTP status code first, then the list returned by the XPath
    query. Any failure is reported on stdout instead of being raised.

    Args:
        url: Full address of the page, including its query string.
    """
    try:
        r=requests.get(url,timeout=30)# timeout so a stalled server cannot hang the script
        print(r.status_code)# HTTP status code
        r.raise_for_status()# raises HTTPError for 4xx/5xx responses
        html=r.content.decode('UTF8')# decode the raw bytes as UTF-8
        tree=fromstring(html)# parse the markup into an element tree
        result=tree.xpath('//span[@class="bjh-p"]/text()')# text of the matching spans
        print(result)
    except Exception as exc:
        # Best-effort script: report the failure together with its cause.
        print('发生异常',exc)
if __name__ == "__main__":
    url="https://baijiahao.baidu.com/s?id=1627614008110090953&wfr=spider&for=pc"
    getdatabyrequests(url)

#params设参
import requests
from lxml.html import fromstring
def getdatabyrequests(url,keyword1,keyword2,keyword3):
    """Fetch *url* with query parameters and print the ``bjh-p`` span texts.

    The three keywords are sent as the ``id``, ``wfr`` and ``for`` query
    parameters. Prints the HTTP status code first, then the list returned by
    the XPath query. Any failure is reported on stdout instead of being raised.

    Args:
        url: Address of the page without a query string.
        keyword1: Value of the ``id`` parameter.
        keyword2: Value of the ``wfr`` parameter.
        keyword3: Value of the ``for`` parameter.
    """
    try:
        data={"id":keyword1,"wfr":keyword2,"for":keyword3}
        r=requests.get(url,params=data,timeout=30)# timeout so a stalled server cannot hang the script
        print(r.status_code)
        r.raise_for_status()# raises HTTPError for 4xx/5xx responses
        html=r.content.decode('UTF8')# decode the raw bytes as UTF-8
        tree=fromstring(html)# parse the markup into an element tree
        result=tree.xpath('//span[@class="bjh-p"]/text()')# text of the matching spans
        print(result)
    except Exception as exc:
        # Best-effort script: report the failure together with its cause.
        print('发生异常',exc)
if __name__ == "__main__":
    url="https://baijiahao.baidu.com/s"
    keyword1=1627614008110090953
    keyword2="spider"
    keyword3="pc"
    getdatabyrequests(url,keyword1,keyword2,keyword3)

#访问并保存图片
import requests
import os
def getdatabyrequests(url):
    """Download the image at *url* and save it under ``C://pics//``.

    The file is named ``a.<ext>`` where ``<ext>`` is whatever follows the last
    ``.`` in *url*. Nothing is downloaded when that file already exists.
    Progress and failures are reported on stdout; nothing is raised.

    Args:
        url: Address of the image to download.
    """
    root="C://pics//"# target directory
    path=root + 'a.' + url.split('.')[-1]# e.g. "C://pics//a.jpg"
    try:
        os.makedirs(root,exist_ok=True)# create the directory only when missing
        if not os.path.exists(path):
            r=requests.get(url,timeout=30)
            r.raise_for_status()# do not save an HTTP error page as if it were the image
            with open(path,'wb') as f:# the with block closes the file itself
                f.write(r.content)
            print('文件保存成功')
        else:
            print('文件已存在')
    except Exception as exc:
        # Best-effort script: report the failure together with its cause.
        print("发生异常",exc)
if __name__ == "__main__":
    url="https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1584512259785&di=3abe7db90eb4e8ad5ee60229c72d7bc7&imgtype=0&src=http%3A%2F%2F211.159.149.56%3A8080%2Fmap%2F64%2F4o28b0625501ad13015501ad2bfc0045.jpg"
    getdatabyrequests(url)

#中国大学排名定向爬虫
#爬虫可行性监测：先确定网页源代码为html格式，然后查看robots协议
import requests
import bs4
from bs4 import BeautifulSoup
import re

#获取网页信息
def gethtmltext(url):
    """Return the decoded body of *url*.

    Args:
        url: Address of the page to download.

    Returns:
        The response text, decoded with the encoding detected from the
        content, or the string ``'发生异常'`` when the request fails.
    """
    try:
        r=requests.get(url,timeout=30)
        r.raise_for_status()# raises HTTPError for 4xx/5xx responses
        r.encoding=r.apparent_encoding# use the encoding guessed from the body
        return r.text# returned so the caller can hand it to the parser
    except requests.RequestException:
        # Only request failures are turned into the sentinel string;
        # KeyboardInterrupt and programming errors still propagate.
        return '发生异常'
        
#解析网页，并将学校信息存入列表ulist中    
def fillunivlist(ulist,html):
    """Parse *html* and append one ``[rank, name, score]`` entry per table row.

    Each entry is built from the ``.string`` of the 1st, 2nd and 4th ``<td>``
    of a ``<tr>`` that is a direct child of the first ``<tbody>``.

    Args:
        ulist: List that receives the entries; it is modified in place.
        html: Markup of the ranking page.
    """
    soup=BeautifulSoup(html,'html.parser')
    tbody=soup.find('tbody')
    if tbody is None:
        # No table in the document (e.g. the download failed and *html* is an
        # error message), so there is nothing to append.
        return
    for tr in tbody.children:
        # The children of <tbody> include bare text nodes; keep only real tags.
        if not isinstance(tr,bs4.element.Tag):
            continue
        tds=tr('td')# shorthand for tr.find_all('td')
        if len(tds)<4:
            continue# not a data row: the 4th cell read below does not exist
        ulist.append([tds[0].string,tds[1].string,tds[3].string])
 
#输出学校信息的列表，并指定学校个数   
def printunivlist(ulist,num):
    """Print a header line followed by at most *num* entries of *ulist*.

    Args:
        ulist: Sequence of ``[rank, name, score]`` entries.
        num: Maximum number of entries to print. Fewer are printed when
            *ulist* is shorter; none when *num* is zero or negative.
    """
    # Column 1 is padded with chr(12288), the full-width space, so Chinese
    # names line up; the fill character is passed as format argument 3.
    tplt='{0:^10}\t{1:{3}^10}\t{2:^10}'
    print(tplt.format('排名','大学名称','分数',chr(12288)))# header
    # Slicing instead of indexing range(num) avoids IndexError on short lists.
    for u in ulist[:max(num,0)]:
        # A missing cell text is None, which cannot take a '^10' format spec;
        # print it as an empty field instead.
        rank,name,score=('' if field is None else field for field in u[:3])
        print(tplt.format(rank,name,score,chr(12288)))
        
#主函数        
def main():
    """Download the 2019 ranking page, parse it and print the first 20 entries."""
    url='http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html'
    universities=[]
    page=gethtmltext(url)# page content of the ranking
    fillunivlist(universities,page)# collect the entries found in the page
    printunivlist(universities,20)# show only 20 universities
main()

socket套接字

#客户端脚本（百度首页示例）
import socket
def getdatabysocket(url,port):
    """Send a plain ``GET /`` request over a TCP socket and print the raw reply.

    Prints the peer address, the size of every chunk received and, once the
    server has closed the connection (or a receive error occurred), all
    received bytes joined together.

    Args:
        url: Host name or IP address of the server.
        port: TCP port of the server.
    """
    data=[]
    # IPv4 + TCP; the with block closes the socket even if an error is raised.
    with socket.socket(socket.AF_INET,socket.SOCK_STREAM) as mysock:
        mysock.connect((url,port))# the address must be a (host, port) tuple
        print("建立连接的远程服务器地址：",mysock.getpeername())
        # Request line, then headers, then an empty line, each ended by CRLF.
        # HTTP/1.1 requires a Host header; "Connection: close" makes the
        # server close the connection after the reply so the loop below ends.
        request=(
            'GET / HTTP/1.1\r\n'
            +"HOST:"+url+"\r\n"
            +'Connection: close\r\n'
            +'\r\n'
        )
        mysock.sendall(request.encode('utf8'))# sendall retries until every byte is sent
        while True:
            try:
                d=mysock.recv(1024)
            except OSError as exc:
                print("发生异常",exc)
                break
            print("读取数据数量：",len(d))
            if not d:# an empty read means the peer closed the connection
                print("数据读取完毕")
                break
            data.append(d)
    if data:
        result=b''.join(data)# all chunks as a single bytes object
        print(result)# printed once at the end, not after every chunk
if __name__ == "__main__":
    url="www.baidu.com"
    port=80
    getdatabysocket(url,port)

urllib

#脚本（百度首页示例）
import urllib.request,urllib.parse,urllib.error
def getdatabyurllib(url):
    """Open *url* and print its response type followed by every line of the body.

    Each line is decoded with the default codec (UTF-8) and stripped of
    surrounding whitespace before being printed.

    Args:
        url: Address to open; it must include the scheme.
    """
    # The with block closes the response once the body has been read.
    with urllib.request.urlopen(url) as response:
        print(type(response))# type of the response object
        # Alternative: read and print the whole body at once
        #print(response.read().decode().strip())
        for line in response:# print line by line
            print(line.decode().strip())
if __name__ == "__main__":
    url="http://www.baidu.com"# the quotes are required
    getdatabyurllib(url)

#脚本（文本网页示例）
import urllib.request,urllib.parse,urllib.error
def getdatabyurllib(url):
    """Open *url*, count its whitespace-separated words and print the counts.

    Each line is decoded with the default codec (UTF-8) before splitting.
    The result is printed as a dict mapping word to frequency.

    Args:
        url: Address of a text resource; it must include the scheme.
    """
    counts=dict()
    # The with block closes the response once the body has been read.
    with urllib.request.urlopen(url) as response:
        for line in response:
            words=line.decode().split()# split the line on whitespace
            for word in words:
                counts[word]=counts.get(word,0)+1# add one to this word's frequency
    print(counts)
if __name__ == "__main__":
    url="文本网页路径"
    getdatabyurllib(url)

Ayanha

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
1
评论
python实例

1、实例一 #打开文件 a=open(r'C:\Users\用户\Desktop\文件名.txt',mode='r') #获取以P开头的行数与字符数（法一） count=0 word_sum=0 for i in a: i=i.strip()#去除每一行两边的空格 if i.startswith('p'): print(i)#以p开头的行 count=count+1 word...
复制链接

扫一扫

专栏目录