requests and bs4
Parsing Beike rental listings (贝壳租房) with a regex
import csv
import requests
from re import findall

response = requests.get('https://cd.zu.ke.com/zufang')
result = response.text
names = findall(r'(?s)<a class="twoline".+?>(.+?)</a>', result)
names = [x.strip() for x in names]
prices = findall(r'<span class="content__list--item-price"><em>(\d+)</em>', result)
house = zip(names, prices)        # pair each listing name with its price
f = open('files/租房.csv', 'a', encoding='utf-8', newline='')
writer = csv.writer(f)
writer.writerow(['名称', '价格'])
writer.writerows(list(house))
f.close()
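The (?s) inline flag in the pattern above is what lets '.' match newline characters (the same as re.S / re.DOTALL), so the pattern can span the multi-line <a> tag. A minimal sketch of the difference, using a made-up HTML snippet rather than the real page:
from re import findall

sample = '<a class="twoline" href="#">\n  整租·某小区 2室1厅\n</a>'
# without (?s), '.' stops at the newline, so nothing matches
print(findall(r'<a class="twoline".+?>(.+?)</a>', sample))      # []
# with (?s), '.' also matches '\n' and the tag body is captured
print(findall(r'(?s)<a class="twoline".+?>(.+?)</a>', sample))  # ['\n  整租·某小区 2室1厅\n']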
Image download
import requests
def download_image(url:str):
    response = requests.get(url)      # use the url parameter, not the literal string 'url'
    data = response.content           # raw bytes of the image
    with open('files/qx.jpeg', 'wb') as f:
        f.write(data)
if __name__ == '__main__':
download_image('https://p2.itc.cn/images01/20220811/6d640bcab37e4e5ea17e7c747f8f4a53.jpeg')
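For bigger images it can be safer to download in streaming mode so the whole file is not held in memory at once. This is only a variant sketch of download_image above (the output path is made up), using requests' stream=True and iter_content:
import requests

def download_image_stream(url: str, path: str):
    response = requests.get(url, stream=True)                    # defer downloading the body
    with open(path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):     # read in 8 KB chunks
            f.write(chunk)

if __name__ == '__main__':
    download_image_stream('https://p2.itc.cn/images01/20220811/6d640bcab37e4e5ea17e7c747f8f4a53.jpeg', 'files/qx_stream.jpeg')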
Batch image download
import requests
from re import findall
from uuid import uuid1
def download_image(url:str):
    response = requests.get(url)
    # uuid1() gives each file a unique name so downloads do not overwrite each other
    with open(f'files/{uuid1()}.jpg', 'wb') as f:
        f.write(response.content)

# the cover image URL of each listing is stored in the <img> tag's data-src attribute
response = requests.get('https://cd.zu.ke.com/zufang')
content = response.text
all_images = findall(r'(?s)<a\s+class="content__list--item--aside".+?>\s+<img.+?data-src="(.+?)"', content)
for x in all_images:
    download_image(x)
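Both download examples assume a 'files/' directory already exists; if it might not, creating it first avoids a FileNotFoundError when opening the output file. A small sketch with pathlib:
from pathlib import Path

# create 'files/' if it is missing; exist_ok=True makes this a no-op otherwise
Path('files').mkdir(parents=True, exist_ok=True)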
Browser disguise (User-Agent header)
import requests
headers = {
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
response = requests.get('https://movie.douban.com/top250',headers=headers)
print(response.text)
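It is worth checking whether the disguise actually worked: Douban tends to reject requests that use the requests library's default User-Agent, so a status-code check makes a failure visible instead of silently parsing an error page. A hedged sketch (the exact rejection code may vary):
import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
response = requests.get('https://movie.douban.com/top250', headers=headers)
if response.status_code == 200:
    print('ok,', len(response.text), 'characters received')
else:
    # without a browser-like User-Agent the site usually refuses the request
    print('request rejected, status:', response.status_code)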
Data parsing with bs4
"""
A third-party library dedicated to parsing web page data (it locates data with CSS selectors).
Install it as 'beautifulsoup4'; import it as 'bs4'.
Note: parsing data with bs4 also depends on the third-party library 'lxml'.
"""
from bs4 import BeautifulSoup
html = open('files/test.html',encoding='utf-8').read()
soup = BeautifulSoup(html,'lxml')
result = soup.select('#box1 p')        # select(): every <p> descendant of #box1, returned as a list
print(result)
result = soup.select_one('#box1 p')    # select_one(): only the first match
print(result)
result = soup.select('p')              # all <p> tags in the whole document
print(result)
box2 = soup.select_one('#box2')
result = box2.select('p')              # select() can be called on a tag to search only inside it
print(result)
p = soup.select_one('p')
img = soup.select_one('img')
print(p.text)                                  # .text: the tag's text content
print(img.attrs['src'], img.attrs['alt'])      # .attrs: the tag's attributes as a dict
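select() accepts any CSS selector, not just tag and id selectors. A self-contained sketch on a small inline HTML string (this HTML is made up for illustration; it is not the content of files/test.html):
from bs4 import BeautifulSoup

html = '''
<div id="box1">
    <p class="first">hello</p>
    <p>world</p>
    <img src="a.png" alt="demo">
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.select('#box1 > p'))           # direct child <p> tags of #box1
print(soup.select('p.first'))             # <p> tags with class "first"
print(soup.select('p:nth-of-type(2)'))    # the second <p> among sibling <p> tags
print(soup.select('img[alt]'))            # <img> tags that have an alt attribute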
Parsing one page of Douban movie data with bs4
import csv
from bs4 import BeautifulSoup
import requests
def get_net_data(url: str):
Headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
response = requests.get(url, headers=Headers)
return response.text
def analysis_data(html: str):
soup = BeautifulSoup(html,'lxml')
all_film_div = soup.select('.grid_view>li>.item')
all_data = []
for div in all_film_div:
name = div.select_one('.title').text
info = div.select_one('.bd>p').text.strip().split('\n')[-1].strip()
time,country,category = info.split('/')
score = div.select_one('.rating_num').text
comment_count = div.select('.star>span')[-1].text[:-3]
intro = div.select_one('.inq').text
all_data.append([name,time.strip(),country.strip(),category.strip(),score,comment_count,intro])
    f = open('files/豆瓣电影第一页数据.csv', 'w', encoding='utf-8', newline='')
    writer = csv.writer(f)
    writer.writerow(['电影名', '上映时间', '国家', '类型', '评分', '评论数', '简介'])
    writer.writerows(all_data)
    f.close()
if __name__ == '__main__':
    result = get_net_data('https://movie.douban.com/top250')
analysis_data(result)
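The unpacking time, country, category = info.split('/') assumes the info line always has exactly three '/'-separated fields (e.g. '1994 / 美国 / 犯罪 剧情'). Some entries appear to put several release years in that line, which adds extra '/' and makes the unpacking raise a ValueError; indexing from both ends, as the multi-page version below does, avoids that. A minimal sketch with an illustrative, made-up info string:
info = '1961(中国大陆) / 1964 / 中国大陆 / 动画 奇幻'     # hypothetical entry with an extra '/'
parts = info.split('/')
# time, country, category = parts                        # ValueError: too many values to unpack
time, country, category = parts[0], parts[-2], parts[-1]
print(time.strip(), country.strip(), category.strip())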
Douban movies (all pages)
import csv
from bs4 import BeautifulSoup
import requests
def get_net_data(url: str):
Headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
response = requests.get(url, headers=Headers)
return response.text
def analysis_data(html: str):
soup = BeautifulSoup(html, 'lxml')
all_film_div = soup.select('.grid_view>li>.item')
all_data = []
for div in all_film_div:
name = div.select_one('.title').text
info = div.select_one('.bd>p').text.strip().split('\n')[-1].strip()
info_list = info.split('/')
time = info_list[0]
country = info_list[-2]
category = info_list[-1]
score = div.select_one('.rating_num').text
comment_count = div.select('.star>span')[-1].text[:-3]
intro_span = div.select_one('.inq')
if intro_span:
intro = intro_span.text
else:
intro = ''
all_data.append([name, score, time.strip(), country.strip(), category.strip(), comment_count, intro])
    # writer is created in the __main__ block below and reused for every page
    writer.writerows(all_data)
    print('保存成功!')
if __name__ == '__main__':
f = open('files/豆瓣电影.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(f)
writer.writerow(['电影名', '评分', '上映时间', '国家', '类型', '评论数', '简介'])
    for page in range(0, 250, 25):      # start=0,25,...,225 covers the 10 pages of the Top 250
        url = f'https://movie.douban.com/top250?start={page}&filter='
        result = get_net_data(url)
        analysis_data(result)
    f.close()
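When crawling all ten pages it can help to pause briefly between requests so the site is not hit too quickly; this is an optional tweak, not part of the original loop. A sketch of the same loop (reusing get_net_data, analysis_data, and the writer set up above) with a short random delay:
import random
import time

for page in range(0, 250, 25):
    url = f'https://movie.douban.com/top250?start={page}&filter='
    result = get_net_data(url)          # functions and writer come from the section above
    analysis_data(result)
    time.sleep(random.uniform(1, 2))    # short random pause between pages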