python是解释型 面向对象的语言
-
输入(返回值是str)
input('The input is:')
price = int(input('The price is:'))
一些例子:
eval可以把输入的字符串当作有效的表达式来求值。
-
注释# 续行\ 一行多句;
-
打开文件/文件读写
readlines()返回结果是一个列表,在输出结果中有换行符
如上图,将文件中的字符串加上序号:(要用str()函数把整型转成字符串才行)
在文件中移动指针
f.seek(offset,whence=0)
-
举个例子
import os
def countLines(fname):
    """Print how many lines the text file *fname* contains.

    If the file does not exist, report that and return early; the original
    version fell through and raised NameError on the undefined `data`.
    """
    try:
        with open(fname) as f:
            data = f.readlines()
    except FileNotFoundError:
        print(fname + ' does not exist')
        return
    # os.path.basename handles both '/' and (on Windows) '\\' separators;
    # the original fname.split('\\')[1] raised IndexError on POSIX paths
    # because os.path.join below produces '/'-separated paths there.
    print(os.path.basename(fname) + ' has ' + str(len(data)) + ' lines')
# Report the line count of every .txt file in the test-data directory.
path = './testdata'
txt_names = (name for name in os.listdir(path) if name.endswith('.txt'))
for name in txt_names:
    countLines(os.path.join(path, name))
-
python从网络上获取数据:抓取网页&解析网页内容
抓取:
1、Requests第三方库 很适合做中小型的网络爬虫的开发
2、scrapy非常流行的开源的爬虫框架 大型爬虫
抓取的过程就是 客户机发送一个请求Request给服务器;服务器会返回一个响应Response
拿到一个响应后要对它进行解析,现在流行的解析工具有Beautiful Soup库和正则表达式模块。
有的时候还可以通过第三方的API来更加方便的抓取和解析网页的内容。
requests.get()
对应的就是http协议里的GET方法,请求获取指定URL位置
抓取之前 先看网站的爬虫协议 robots.txt 爬多个页面时 注意延时是多少
import requests

# Download Baidu's logo image and write the raw response bytes to a file.
resp = requests.get('https://www.baidu.com/img/bd_logo1.png')
with open('baidu.png', 'wb') as out:
    out.write(resp.content)
import requests

# Request a Weibo profile page and show the HTTP status plus the raw HTML.
resp = requests.get('https://weibo.com/u/5239099662')
print(resp.status_code)  # expect 200 on success
print(resp.text)
解析:
有规范简单的标签进行标记的数据适合于 Beautiful Soup库解析
Beautiful Soup是一个很方便的HTML 和XML的解析器
数据结构在细节上面比较复杂的数据的更适合于使用正则表达式模块解析
import requests
from bs4 import BeautifulSoup

# Fetch the page and print the text of every <span class="short"> element.
response = requests.get('https://weibo.com/u/5239099662')
document = BeautifulSoup(response.text, 'lxml')
# find_all returns a list of matching tags
for span in document.find_all('span', 'short'):
    print(span.string)
正则表达式
(加?非贪婪 不加?贪婪)
import requests, re, time
from bs4 import BeautifulSoup

# Scrape the first 50 "hot" comments of one Douban book and print the
# integer average of the star ratings collected alongside them.
# NOTE(review): comments and star ratings are extracted independently; a
# comment with no rating would misalign the trim at the end — confirm
# against the live page layout.
count = 0        # comments seen so far (printing stops after 50)
page = 0         # zero-based page index
count_del = 0    # comments encountered beyond the 50th
lst_stars = []   # star values (ints) gathered from every fetched page
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
while count < 50:
    try:
        r = requests.get('https://book.douban.com/subject/1084336/comments/hot?p=' + str(page + 1), headers = headers)
    except Exception as err:
        print(err)
        break
    soup = BeautifulSoup(r.text, 'lxml')
    comments = soup.find_all('span', 'short')
    pattern = re.compile('<span class="user-stars allstar(.*?) rating"')
    stars_on_page = re.findall(pattern, r.text)
    for item in comments:
        count += 1
        if count > 50:
            count_del += 1  # count the comments past the 50th on this page
        else:
            print(count, item.string)
    for star in stars_on_page:
        lst_stars.append(int(star))
    time.sleep(5)  # delay requested by douban's robots.txt
    page += 1
# Keep only the ratings belonging to the first 50 comments.  The original
# used lst_stars[:-count_del], which is EMPTY when count_del == 0 because
# [:-0] means [:0]; slice with an explicit end index instead.
kept = lst_stars[:len(lst_stars) - count_del]
s = sum(kept)
# Guard the division: kept can be empty if the request loop broke early.
if count >= 50 and kept:
    print(s // len(kept))
import requests
import re
def retrieve_dji_list():
    """Scrape CNN Money's nasdaq page into a table: a header row followed
    by one list of text fields per company."""
    resp = requests.get('https://money.cnn.com/data/markets/nasdaq/')
    # The page has exactly one <thead>; its <th> cells are the column names.
    head = re.findall('<thead>(.*?)</thead>', resp.text)
    assert len(head)==1
    table_head = ['Company'] + re.findall('<th>(.+?)<', head[0])
    # The single <tbody> holds the whole table body.
    tbody = re.findall(re.compile('tbody>(.*?)</tbody'), resp.text)
    assert len(tbody) == 1
    # One <tr> per record, then one text field per cell inside it.
    records = re.findall(re.compile('<tr>(.*?)</tr>'), tbody[0])
    field_pat = re.compile('>([^<^&]+?)<')
    return [table_head] + [re.findall(field_pat, rec) for rec in records]
dji_list = retrieve_dji_list()
print(dji_list)
import re
import requests
year = 2019
def crawler(url):
    """Fetch a VNL ranking page and pull (team, won, lost, points) tuples
    out of the women's round-1 table.

    Returns the list of regex match tuples on success, or the caught
    RequestException object on failure — callers must type-check the
    result before using it.
    """
    try:
        r = requests.get(url)
    except requests.exceptions.RequestException as err:
        return err
    # Let requests adopt its best guess of the page's real encoding
    # before r.text is decoded.
    r.encoding = r.apparent_encoding
    # Raw string so \s reaches the regex engine cleanly — a plain string
    # literal with \s raises SyntaxWarning on Python 3.12+.  The pattern
    # must stay on one line; it is otherwise unchanged.
    pattern = re.compile(r'href="/en/vnl/%s/women/teams/.*?">(.*?)</a></figcaption>\s+</figure>\s+</td>\s+<td></td>\s+<td class=".*?">(.*?)</td>\s+<td class=".*?">(.*?)</td>\s+<td class=".*?">(.*?)</td>' % year)
    p = re.findall(pattern, r.text)
    return p
url = 'http://www.volleyball.world/en/vnl/%s/women/resultsandranking/round1' % year
result = crawler(url)
print(result)