# @author : scott
# @site :
# @time : 2022/2/5 0:21
# 导包 (import required modules)
import urllib.request
from lxml import html
etree = html.etree
def create_request(page):
    # Build a urllib Request object for one ranking page.
    # NOTE(review): `page` is unused here — `url` and `headers` must be
    # module-level globals defined outside this chunk (not visible in this
    # view); presumably the URL should incorporate `page`. TODO: confirm
    # against the full file before relying on multi-page fetching.
    request = urllib.request.Request(url = url,headers = headers)
    return request
def get_content(request):
    """Fetch the page for *request* and return its body decoded as GBK.

    Parameters
    ----------
    request : urllib.request.Request
        A fully prepared request object.

    Returns
    -------
    str
        The response body, decoded with the "gbk" codec (the target site
        serves GBK-encoded Chinese pages).
    """
    # BUG FIX: the original never closed the response, leaking the
    # underlying socket. urlopen's return value is a context manager,
    # so `with` guarantees it is closed even if decoding raises.
    with urllib.request.urlopen(request) as response:
        return response.read().decode("gbk")
def download_namelist(content):
    """Extract the list of work (book) titles from the page HTML.

    Titles are the text of ``<a title="...">`` links inside
    ``<div class="name">`` elements on the ranking page.
    """
    document = etree.HTML(content)
    return document.xpath('//div[@class="name"]/a[@title]//text()')
def download_authorlist(content):
    """Extract the list of author names from the page HTML.

    Authors are the text of the first ``<a>`` link inside each
    ``<div class="publisher_info">`` element.
    """
    document = etree.HTML(content)
    return document.xpath('//div[@class="publisher_info"]/a[1]//text()')
if __name__ == '__main__':
    star_page = int(input("请输入从第几页开始获取: "))
    end_page = int(input("请输入到第几页结束获取: "))
    # Pin the output encoding so Chinese titles round-trip regardless of
    # the OS default locale encoding.
    with open(r"book_ranking_list.txt", "w", encoding="utf-8") as f:
        for page in range(star_page, end_page + 1):
            # (1) build the request, (2) download the page source,
            # (3) extract titles and authors.
            request = create_request(page)
            content = get_content(request)
            works_list = download_namelist(content)
            authors_list = download_authorlist(content)
            # BUG FIX: i/j were initialized once before the page loop and
            # never reset, so every page after the first reused stale
            # indices into the fresh per-page lists. Reset them per page.
            i = 0  # index into works_list (titles, sometimes followed by "...")
            j = 0  # index into authors_list; odd slots hold the publisher, which is unused
            while i < len(works_list):
                title = works_list[i]
                # Long book names are split into the name plus a literal
                # "..." item; merge the two back into one title.
                if i + 1 < len(works_list) and works_list[i + 1] == "...":
                    title += works_list[i + 1]
                    i += 1
                f.write(title + "\t" + "作者: ")
                # Guard against a short authors_list instead of raising
                # IndexError mid-file (the original could crash here).
                if j < len(authors_list):
                    f.write(authors_list[j])
                f.write("\n")
                i += 1
                j += 2
    # no explicit f.close(): the with-statement already closed the file
# 爬虫的代码 (crawler code)
# 于 2022-02-13 06:04:35 首次发布 (first published 2022-02-13 06:04:35)