因为课题研究需要搜集出版社名称,我打算从一个网站上获取国外出版社的名称。该网站一共有223页,除最后一页外,其余每页都有10个出版社名称。按 Fn+F12 打开浏览器的开发者工具,找到出版社名称所在的位置,然后开始编程!
直接上代码。
# coding:utf-8
import os
import requests
from bs4 import BeautifulSoup
def getinfo(soup):
    """Extract the item names and the next-page path segment from one page.

    Args:
        soup: Parsed listing page. Only its ``select`` method is used.

    Returns:
        A ``(names, page)`` tuple. ``names`` is the text of every element
        matching ``.item-name``. ``page`` is the last path component of the
        ``href`` of the ``.page-link`` element at index ``len(pages) - 2``
        (presumably the "Next" link -- confirm against the site's markup).

    Raises:
        IndexError: If the page contains no ``.page-link`` elements.
    """
    items = soup.select('.item-name')
    print(len(items))
    # Collect every matched element rather than a fixed first eight: a fixed
    # count drops entries on full pages and raises IndexError on a page that
    # has fewer matches (e.g. a short last page).
    names = [item.get_text() for item in items]
    pages = soup.select('.page-link')
    href = pages[len(pages) - 2]['href']
    # Only the trailing path component is needed; the caller appends it to
    # its base URL.
    _, page = os.path.split(href)
    return names, page
# Request headers: send a desktop-browser User-Agent string with each request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'}
# Path segment appended to the base URL; the empty string fetches the first page.
page = ''
url = "https://www.publishersglobal.com/directory/media/book-publishers/"
while True:
    # A timeout keeps a stalled connection from hanging the crawl forever.
    ret = requests.get(url + page, headers=headers, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    ret.raise_for_status()
    soup = BeautifulSoup(ret.text, 'lxml')
    names, page = getinfo(soup)
    print(page)
    # Append mode so names written for earlier pages are preserved.
    with open('F:/Experiment/scrapy/test.txt', 'a', encoding='utf-8') as f:
        f.writelines(str(name) + '\n' for name in names)
    if 'Next' not in ret.text:  # no "Next" text on the page: download is complete
        print('所有出版社名称下载完毕')
        break