Saving in plain format
import requests
import re
from requests.exceptions import RequestException
import csv
'''
# Opening the CSV in a+ mode appends, which is the same idea as pandas to_csv with mode='a'
csv_file = open(r'E:\vscode_code\爬虫测试\B站\Bzhan.csv', 'w', newline='', encoding='utf-8-sig')  # utf-8-sig keeps the Chinese text from being garbled
writer = csv.writer(csv_file)
# In theory this should be opened in 'w' mode; left unchanged here
writer.writerow(['排名', '名称', '观看人数'])
'''
url = 'https://www.bilibili.com/ranking/all/0/0/3'
def get_one_page(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
def parse_one_page(html):
    pattern = re.compile(r'class="title">(.*?)</a>.*?<div class="num">(.*?)</div>', re.S)
    items = re.findall(pattern, html)
    '''
    for item in items:
        # Write the fields into separate columns here
        writer.writerow([item[0], item[1], item[2]])
        # print(item)
    '''
    print(items)
def main():
    html = get_one_page(url)
    parse_one_page(html)
    print('ok')

if __name__ == '__main__':
    main()
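The commented-out writer block above is left over from a three-column version; the active regex only captures two groups (title and view count). A minimal sketch of wiring those pairs into the csv writer, reusing the same output path and with illustrative column names, could look like this:

import csv

def save_items(items, path=r'E:\vscode_code\爬虫测试\B站\Bzhan.csv'):
    # 'w' overwrites the file each run; 'a' would append, as the comment at the top notes
    with open(path, 'w', newline='', encoding='utf-8-sig') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['名称', '观看人数'])  # two columns, matching the two capture groups
        for title, num in items:
            writer.writerow([title.strip(), num.strip()])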
Saving in dict format
import requests
import re
from requests.exceptions import RequestException
import pandas as pd
import csv
'''
# Opening the CSV in a+ mode appends, which is the same idea as pandas to_csv with mode='a'
csv_file = open(r'E:\vscode_code\爬虫测试\B站\Bzhan.csv', 'w', newline='', encoding='utf-8-sig')  # utf-8-sig keeps the Chinese text from being garbled
writer = csv.writer(csv_file)
# In theory this should be opened in 'w' mode; left unchanged here
writer.writerow(['排名', '名称', '观看人数'])
'''
url = 'https://www.bilibili.com/ranking/all/0/0/3'
def get_one_page(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
def parse_one_page(html):
    pattern1 = re.compile(r'target="_blank" class="title">(.*?)</a>', re.S)
    pattern2 = re.compile(r'class="num">(.*?)</div>', re.S)
    names = re.findall(pattern1, html)
    nums = re.findall(pattern2, html)
    print(len(names), len(nums))
    data = {'names': names, 'nums': nums}
    basic_data = pd.DataFrame.from_dict(data=data)
    basic_data.to_csv(r'E:\vscode_code\爬虫测试\B站\Bzhan2.csv', index=False, header=True)
    print(basic_data)
    '''
    for item in items:
        # Write the fields into separate columns here
        writer.writerow([item[0], item[1], item[2]])
        # print(item)
    '''
def main():
    html = get_one_page(url)
    parse_one_page(html)
    print('ok')

if __name__ == '__main__':
    main()
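The comment about csv's a+ mode being the same idea as pandas to_csv with mode='a' can be shown with a small sketch; the data and file name below are only illustrative:

import pandas as pd

df = pd.DataFrame.from_dict({'names': ['video A', 'video B'], 'nums': ['12.3万', '8.1万']})

# The first write creates the file and includes the header row
df.to_csv('Bzhan2.csv', index=False, header=True, encoding='utf-8-sig')

# mode='a' appends further rows, like opening the csv file in a+ mode;
# header=False keeps the column names from being repeated on every append
df.to_csv('Bzhan2.csv', index=False, header=False, mode='a', encoding='utf-8-sig')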
Multiple rankings
import requests
import re
from requests.exceptions import RequestException
import csv
i = [1, 168, 3, 129, 4, 36, 188, 160, 119, 155, 5, 181]
for j in range(len(i)):
    url = 'https://www.bilibili.com/ranking/all/{}/0/3'.format(i[j])

    def get_one_page(url):
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None

    def parse_one_page(html):
        csv_file = open(r'E:\vscode_code\爬虫测试\B站\Bzhan3.csv', 'a+', newline='', encoding='utf-8-sig')
        writer = csv.writer(csv_file)
        pattern = re.compile(r'<div class="num">(.*?)</div>.*?class="title">(.*?)</a>.*?<i class="b-icon play"></i>(.*?)</span>', re.S)
        items = re.findall(pattern, html)
        for item in items:
            # Offset the rank by 100 per category so rows from different rankings stay distinct
            writer.writerow([int(item[0]) + 100 * j, item[1], item[2]])
        csv_file.close()

    def main():
        html = get_one_page(url)
        parse_one_page(html)
        print('ok')

    if __name__ == '__main__':
        main()
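Redefining the three functions on every loop iteration works, but they only need to be defined once. Below is a sketch of the same logic with the loop moved into main; the constant names, the extra offset and writer parameters, and opening the output file once in 'w' mode are this sketch's own choices, not part of the original code:

import requests
import re
import csv
from requests.exceptions import RequestException

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
CATEGORIES = [1, 168, 3, 129, 4, 36, 188, 160, 119, 155, 5, 181]
PATTERN = re.compile(r'<div class="num">(.*?)</div>.*?class="title">(.*?)</a>.*?<i class="b-icon play"></i>(.*?)</span>', re.S)

def get_one_page(url):
    try:
        response = requests.get(url, headers=HEADERS)
        return response.text if response.status_code == 200 else None
    except RequestException:
        return None

def parse_one_page(html, offset, writer):
    # offset keeps ranks from different categories apart, as the original 100*j trick does
    for num, title, plays in re.findall(PATTERN, html):
        writer.writerow([int(num) + offset, title, plays])

def main():
    with open(r'E:\vscode_code\爬虫测试\B站\Bzhan3.csv', 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        for j, cat in enumerate(CATEGORIES):
            url = 'https://www.bilibili.com/ranking/all/{}/0/3'.format(cat)
            html = get_one_page(url)
            if html:  # skip categories that failed to download
                parse_one_page(html, 100 * j, writer)
            print('ok')

if __name__ == '__main__':
    main()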