欢迎大家关注我的微信公众号!
名称:爬虫与地理信息
一、爬取思路分析
1.URL链接地址分析
首先打开豆瓣图书Top250主页:https://book.douban.com/top250,鼠标滑到页面下方可以看见有页面选项,我们通过点击不同页面即可发现url链接地址的变化规律。
查阅前三页的url分别如下,可以发现一直变化的为最后一个参数start,其值的变化规律为25*(n-1),n为页数,依此我们可以构造出所有的url。
2.网页源码分析
鼠标右击检查或按F12打开调试页面,通过定位我们发现书籍信息位于tr标签内,每一本书对应一个tr标签。因此,我们的爬取思路是:首先获取所有的tr标签,然后通过在tr标签中定位查找每本书的详细信息即可。
接下来,以获取书籍作者、出版社等信息为例来介绍各项信息的定位查找,如下图所示:作者等信息位于tr标签下的一个p标签里面。我们分别采用Xpath和BeautifulSoup两种方法进行定位,Xpath路径为://tr//p[@class="pl"]//text(),BeautifulSoup路径为:tr.find('p', class_='pl').string。其他封面照片、书名、评分、短评等信息可以类似进行定位查找。
二、核心代码
涉及知识点:requests、html、Xpath、BeautifulSoup、csv。
1.获取网页内容
def getHtml(url):
    """Fetch the page at *url* and return its HTML text, or None on failure.

    A random User-Agent from the module-level USER_AGENTS list is sent with
    each request to reduce the chance of being blocked by the site.
    """
    headers = {
        'User-Agent': random.choice(USER_AGENTS),
        # 'Cookie': ' '  # enable if the site starts requiring a session
    }
    try:
        response = requests.get(url, headers=headers, timeout=5)
        # Raise for 4xx/5xx so we don't silently parse an error page.
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        # Narrowed from a bare except: only network/HTTP failures are expected
        # here; anything else should surface as a real bug.
        print("爬取失败!")
        return None
2.获取书籍信息
Xpath方法:
def parse_page_Xpath(text):
    """Parse one Top250 page with lxml XPath.

    Returns a list of comma-joined strings, one per book:
    《name》,writer,country,score,num_ratings,comment,img_url
    """
    html = etree.HTML(text)
    book_info_list = []
    # Each book on the Douban Top250 page sits in its own <tr>.
    for tr in html.xpath("//table/tr"):
        name = tr.xpath('./td/div/a/@title')[0].strip()
        img = tr.xpath("./td//a/img/@src")[0].strip()
        # The publication line is "author / publisher / date / price";
        # the first "/"-separated field is the author, optionally
        # prefixed with "[country]".
        writer_line = tr.xpath("./td//p[@class='pl']//text()")[0].replace(" ", "").split("/")[0]
        if ']' in writer_line:
            writer = writer_line.split(']')[1]
            country = writer_line.split(']')[0].replace("[", "").replace("]", "")
        else:
            # No bracketed prefix: Douban convention for Chinese authors.
            writer = writer_line
            country = "中国"
        score = tr.xpath("./td//span[@class='rating_nums']/text()")[0].strip()
        nums = tr.xpath("./td//span[@class='pl']/text()")[0].replace('(', '').replace(')', '').strip()
        # Some books have no one-line quote ("inq") — fall back to "".
        comment_list = tr.xpath("./td/p/span[@class='inq']/text()")
        comment = comment_list[0].strip() if comment_list else ""
        book_info = "《" + name + "》" + ',' + writer + ',' + country + ',' + score + ',' + nums + ',' + comment + ',' + img
        book_info_list.append(book_info)
    return book_info_list
BeautifulSoup方法:
def parse_page_BS(text):
    """Parse one Top250 page with BeautifulSoup.

    Returns a list of comma-joined strings, one per book:
    《name》,writer,country,score,num_ratings,comment,img_url
    """
    book_info_list = []
    soup = BeautifulSoup(text, 'lxml')
    # Each book on the Douban Top250 page is a <tr class="item">.
    for tr in soup.find_all('tr', class_='item'):
        name = tr.find('div', class_='pl2').find('a')['title']
        img_src = tr.find('img')['src']
        # First "/"-separated field of the publication line is the author,
        # optionally prefixed with "[country]".
        writer_line = tr.find('p', class_='pl').string.strip()
        if ']' in writer_line:
            writer = writer_line.split(']')[1]
            country = writer_line.split(']')[0].replace("[", "").replace("]", "")
        else:
            # No bracketed prefix: Douban convention for Chinese authors.
            writer = writer_line
            country = "中国"
        score = tr.find('span', class_='rating_nums').string.strip()
        nums = tr.find('span', class_='pl').string.strip().replace('(', '').replace(')', '')
        comment_line = tr.find('span', class_='inq')
        # .string is None when the tag is missing OR contains nested markup;
        # coerce to "" so the concatenation below cannot raise TypeError.
        comment = "" if comment_line is None else (comment_line.string or "")
        book_info = "《" + name + "》" + ',' + writer + ',' + country + ',' + score + ',' + nums + ',' + comment + ',' + img_src
        book_info_list.append(book_info)
    return book_info_list
3.下载封面照片
def save_image(i, book_info_list):
    """Download the cover image for each book in *book_info_list*.

    i: zero-based page index; files are numbered 25*i+1 onward.
    book_info_list: comma-joined info strings whose first field is
    《book name》 and whose last field is the image URL.

    Files are saved as "<index>.<name>.jpg" under 豆瓣图书排行榜250\\;
    already-downloaded images are skipped without re-fetching.
    """
    photo_dir = "豆瓣图书排行榜250\\"
    # Create the target directory once, not on every loop iteration.
    if not os.path.exists(photo_dir):
        os.makedirs(photo_dir)
    index = i * 25 + 1
    for line in book_info_list:
        img_url = line.split(",")[-1]
        img_name = str(index) + "." + line.split(",")[0].strip().replace("《", "").replace("》", "") + ".jpg"
        print(img_name + " " + img_url)
        index += 1
        img_full_name = os.path.join(photo_dir, img_name)
        # Check BEFORE downloading — the original fetched the image and only
        # then discovered the file already existed, wasting bandwidth.
        if os.path.exists(img_full_name):
            continue
        try:
            headers = {'User-Agent': random.choice(USER_AGENTS)}
            pic = requests.get(img_url, headers=headers)
            # "with" closes the file even on a write error (the original also
            # called f.close() inside the with-block, which is redundant).
            with open(img_full_name, 'wb') as f:
                f.write(pic.content)
            print(img_url + ":图片下载成功!")
        except Exception:
            # Best-effort download: report and keep going with the next image.
            print(img_url + ":图片下载失败!")
4.保存到文件
保存txt文件
def save_txt(i, book_info_list, fname):
    """Append one page of results to ./豆瓣图书250.txt, one numbered line per book.

    i: zero-based page index; books are numbered 25*i+1 onward.
    book_info_list: comma-joined info strings, one per book.
    fname: accepted only for a signature parallel to save_csv — the output
    path here is hard-coded to ./豆瓣图书250.txt and fname is ignored.
    """
    print("开始写入文件......")
    start = i * 25 + 1
    with open("./豆瓣图书250.txt", 'a+', encoding='utf-8') as f:
        # enumerate replaces the hand-maintained index counter.
        for offset, line in enumerate(book_info_list):
            f.write(str(start + offset) + " " + line + "\n")
    print("写入文件完成!")
保存csv文件
def save_csv(i, book_info_list, fname):
    """Append one page of results to the CSV file *fname*.

    i: zero-based page index; books are numbered 25*i+1 onward.
    book_info_list: comma-joined info strings, one per book.
    fname: path of the CSV file to append to.

    NOTE(review): each row is rebuilt by splitting the info string on ",",
    so a comma inside any field (e.g. the short comment) shifts the columns.
    """
    print("开始写入文件......")
    start = i * 25 + 1
    # "with" guarantees the handle is closed even if a write fails
    # (the original used a bare open()/close() pair).
    # newline="" stops the csv module emitting blank lines on Windows.
    with open(fname, 'a+', encoding='utf-8', newline="") as f:
        csv_writer = csv.writer(f)
        # csv_writer.writerow(["序号","书名", "作者", "国籍", "评分", "人数", "评语","封面图片地址"])
        for offset, line in enumerate(book_info_list):
            record = str(start + offset) + "," + line
            print(record)
            csv_writer.writerow(record.split(','))
    print("写入文件完成!")
5.爬虫主函数
def spider(page_nums):
    """Crawl the first *page_nums* pages (25 books each) of Douban Book Top250.

    For each page: fetch the HTML, parse the book list, then download cover
    images. The txt/csv savers are available but currently commented out.
    """
    print("爬虫开始......")
    url = "https://book.douban.com/top250?start={}"
    for i in range(page_nums):
        full_url = url.format(i * 25)
        print("爬取第 " + str(i + 1) + " 页")
        # Random 1-3 s pause between requests to avoid hammering the server.
        time.sleep(random.randint(1, 3))
        text = getHtml(full_url)
        # getHtml returns None on failure; skip the page instead of letting
        # the parser crash on a None argument.
        if text is None:
            continue
        # Xpath alternative:
        # book_info_list = parse_page_Xpath(text)
        book_info_list = parse_page_BS(text)
        fname = "./豆瓣图书250.csv"
        # save_txt(i, book_info_list, fname)
        # save_csv(i, book_info_list, fname)
        save_image(i, book_info_list)
    print("爬虫结束!")