python读写txt文件
打开文件,读写文件
读
# Read a text file; the context manager closes it automatically.
# Fixes vs the original: the keyword is `encoding` (the original's
# `encodeing` raises TypeError), and the body must be indented.
with open("text.txt", 'r', encoding='utf-8') as f:
    f.read()       # whole file as one string
    f.readline()   # a single line
    f.readlines()  # all lines, returned as a list
    # NOTE: the three calls share one file position, so after read()
    # the later calls return empty results -- each is shown for demo only.
写
# Write a text file (mode 'w' truncates any existing content).
# Fixes vs the original: `encodeing` -> `encoding`, and the body is indented.
with open("text.txt", 'w', encoding='utf-8') as f:
    f.write("一大段文字")
    # writelines() inserts nothing between items -- no automatic newlines:
    f.writelines(['第一段话','第二段话','第三段话'])
    # add the newline characters yourself:
    f.writelines(['第一段话\n','第二段话\n','第三段话'])
python读写csv文件:
import csv
读
# Read a CSV file: DictReader uses the header row as keys and yields
# one dict per data row. (Original snippet had lost its indentation.)
with open('douban.csv', 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        print(row)
        # each row can be used like a plain dict
        # NOTE(review): assumes the CSV has a "username" column -- confirm.
        username = row["username"]
        print(username)
写
# Write a CSV file: one dict per record, keys matching `fieldnames`.
# (Original snippet had lost its indentation.)
data = [{'电影':'肖生克', '评分':9.7,'引言':'像风一样自由'},{'电影':'肖生克2', '评分':10,'引言':'像风一样自222由'}]
# newline="" stops the csv module from emitting blank lines between rows on Windows.
with open('douban.csv', 'w', encoding='utf-8', newline="") as f:
    writer = csv.DictWriter(f, fieldnames=['电影','评分','引言'])
    writer.writeheader()    # header row with the column names
    writer.writerows(data)  # all records in one call
正则表达式用法
import re
# Fetch the page source first.
# NOTE(review): `requests` must be imported before this line (it is imported
# further down in these notes).
chapter_html = requests.get("https://www.kanunu8.com/book3/6879/").content.decode('gb2312')
# findall() returns every non-overlapping match of the first capture group.
# Use a raw string and escape the literal dot so ".html" is matched exactly
# (the original's unescaped `.` matches ANY character).
urls = re.findall(r'<a href="(.*?)\.html">第', chapter_html)
# re.search() stops at the first match; group(1) is the first captured group
# (the groups are numbered because one pattern can capture several pieces).
# re.S lets '.' match newlines too.
# NOTE(review): `content_html` is not defined in this snippet -- it is built
# in the full scraper example below; run that first.
content = re.search(r'<p>(.*?)</p>', content_html, re.S).group(1)
xpath用法
import requests
import lxml.html
# Fetch the page; the decoded body is a plain string.
# NOTE(review): `headers` must be defined before this line -- see the full
# douban example in these notes for a working header dict.
source = requests.get('https://movie.douban.com/top250', headers=headers).content.decode()
# Parse the string into an HTML element tree.
selector = lxml.html.fromstring(source)
# xpath() always returns a LIST of matching elements ...
info = selector.xpath('//div[@class ="info"]')
# ... so continue the search on ONE element, not on the list itself
# (the original called info.xpath(...) directly, which raises AttributeError).
first = info[0]
title = first.xpath('div[@class="hd"]/a/span[@class="title"][1]/text()')
rating_num = first.xpath('div[@class="bd"]/div/span[@class="rating_num"]/text()')
quote = first.xpath('div[@class="bd"]/p/span/text()')
此外:
读取标签里面子标签的文字,使用string(.)
# string(.) concatenates all text inside the node, including child tags' text.
# Fixes vs the original: the attribute value needs quotes (@id="text3"),
# and xpath is a method CALL -- parentheses, not square brackets.
data = selector.xpath('//div[@id="text3"]')[0]
info = data.xpath('string(.)')
xpath里属性的写法
//div[@属性='xx']/a #属性等于某个值
//div[starts-with(@id,'item')]/text() #属性以item开头
//div[contains(@id,'key')]/text() #属性包含key
两个简单的爬虫例子
爬取小说动物农场,并将章节里的内容保存在txt文件里 使用正则表达式
import requests
import re
import os

# Fetch the chapter index page (the site is GB2312-encoded).
chapter_html = requests.get("https://www.kanunu8.com/book3/6879/").content.decode('gb2312')
# Escape the dot so ".html" is matched literally; each capture is a chapter id.
chapter_list = re.findall(r'<a href="(.*?)\.html">第', chapter_html)
# os.makedirs('动物农场', exist_ok=True)  # uncomment to save into a subfolder
for chapter in chapter_list:
    content_html = requests.get(
        "https://www.kanunu8.com/book3/6879/{}.html".format(chapter)
    ).content.decode('gb2312')
    # Chapter title, e.g. "第一章".
    title = re.search('(第.*?章)', content_html).group(1)
    # Everything between the first <p> and the last </p>; re.S lets '.' span newlines.
    content = re.search(r'<p>(.*?)</p>', content_html, re.S).group(1)
    # Strip the HTML line breaks, keeping the plain text.
    content = content.replace('<br />', '')
    # One .txt file per chapter, named after the title.
    with open('{}.txt'.format(title), 'w', encoding='utf-8', newline="") as f:
        f.write(content)
爬取豆瓣250里第一页的数据,保存在csv文件里
import requests
import lxml.html
import csv

# Browser-like headers: douban rejects requests without a User-Agent.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Host': 'movie.douban.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
}
source = requests.get('https://movie.douban.com/top250', headers=headers).content.decode()
selector = lxml.html.fromstring(source)
# One <div class="info"> element per movie on the page.
info = selector.xpath('//div[@class ="info"]')
data = []
# Iterate the elements directly instead of range(len(...)).
for item in info:
    title = item.xpath('div[@class="hd"]/a/span[@class="title"][1]/text()')
    rating_num = item.xpath('div[@class="bd"]/div/span[@class="rating_num"]/text()')
    quote = item.xpath('div[@class="bd"]/p/span/text()')
    data.append({
        '电影': title[0],
        '评分': rating_num[0],
        # some movies have no quote -- fall back to an empty string
        # instead of crashing on quote[0]
        '引言': quote[0] if quote else '',
    })
# newline="" prevents blank lines between CSV rows on Windows.
with open('douban.csv', 'w', encoding='utf-8', newline="") as f:
    writer = csv.DictWriter(f, fieldnames=['电影','评分','引言'])
    writer.writeheader()
    writer.writerows(data)
爬取豆瓣250里第一页的数据:使用正则表达式的方法
# Douban top250, page 1, extracted with regular expressions only.
# (The original had this entire script collapsed onto one line.)
import requests
import re

url = "https://movie.douban.com/top250"
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}
r = requests.get(url=url, headers=headers)
print(r.text)
r = r.text
"""
Regex quick reference
.   matches any single character except a newline (spaces included)
*   previous token repeated zero or more times
?   previous token zero or one time
.*? non-greedy match
"""
# One compiled pattern per field; compiling once is faster when reused.
parr_title = re.compile('img width="100" alt="(.*?)" src=')
parr_href = re.compile('src="(.*?)" class=""')
parr_rating_num = re.compile(' <span class="rating_num" property="v:average">(.*?)</span>')
parr_rating_person = re.compile('<span>(.*?)人评价</span>')
parr_direc_act = re.compile('导演: (.*?)<br>')

datas = []
title = re.findall(parr_title, r)
href = re.findall(parr_href, r)
rating_num = re.findall(parr_rating_num, r)
rating_person = re.findall(parr_rating_person, r)
direc_act = re.findall(parr_direc_act, r)

# Collect the parallel field lists into one record per movie.
# The index form is kept (rather than zip) so a missing field fails loudly.
for i in range(len(title)):
    datas.append([title[i], href[i], rating_num[i], rating_person[i], direc_act[i]])
for data in datas:
    print(data)