import requests, time, re, os
import pandas as pd
def example():
    """Scrape the first two pages of Douban's latest-books listing.

    For each page: collect book names and review counts, and download
    every cover image into the '图片下载' directory. After all pages are
    processed, write the accumulated names/review counts to
    '图书信息.csv'. Network-bound; sleeps between pages to be polite.
    """
    # Create the image output directory if it does not exist yet.
    os.makedirs('图片下载', exist_ok=True)

    names = []
    reviews = []
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'}

    # Compile the patterns once, outside the page loop.  Raw strings fix
    # the original non-raw '\(' which is an invalid escape sequence
    # (SyntaxWarning / future SyntaxError on modern Python).
    name_pattern = re.compile(r'<a class="fleft" href=.*?>(.*?)</a>')
    review_pattern = re.compile(r'<span class="fleft ml8 color-gray">\((.*?)人评价\)</span>')
    img_url_pattern = re.compile(r'<img class="subject-cover" align="left" src=(.*?)>')

    for page in range(1, 3):
        # Fetch the listing page's HTML source.
        url = f'https://book.douban.com/latest?subcat=%E5%85%A8%E9%83%A8&p={page}'
        response = requests.get(url=url, headers=head)
        code = response.text

        # Parse book names and review counts out of the page source.
        name = name_pattern.findall(code)
        names.extend(name)
        reviews.extend(review_pattern.findall(code))

        # Parse cover-image URLs and strip the surrounding quote marks.
        img_urls = [u.replace('"', '') for u in img_url_pattern.findall(code)]

        # zip() pairs names with URLs safely even if the two lists differ
        # in length (the original indexed by len(name) and would raise
        # IndexError when img_urls came up shorter).
        for na, img_url in zip(name, img_urls):
            # Book titles may contain characters that are illegal in file
            # names (e.g. '/', ':'); replace them so open() cannot fail.
            safe_na = re.sub(r'[\\/:*?"<>|]', '_', na)
            img_name = os.path.join('图片下载', safe_na + '.jpg')
            img_response = requests.get(img_url, headers=head)
            with open(img_name, mode='wb') as f:
                f.write(img_response.content)

        # Throttle between pages to avoid hammering the server.
        time.sleep(4)

    # Persist the collected data; utf-8-sig keeps Excel happy with CJK text.
    df = pd.DataFrame({'名字': names, '评论数': reviews})
    df.to_csv('图书信息.csv', encoding='utf-8-sig', index=None)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    example()
Python爬取
于 2022-10-11 13:35:22 首次发布