'''
scrape quotes from site www.passiton.com
'''
import csv
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup
from threadpool import ThreadPool, makeRequests  # legacy pool; retained for compatibility
def scrapePage(url):
    """Scrape every quote card from one listing page of passiton.com.

    Appends one dict per quote ({'theme', 'lines', 'author'}) to the
    module-level ``quotes`` list. The function is run from worker threads;
    ``list.append`` is atomic under the GIL, so no extra locking is needed.

    Parameters:
        url: full URL of a paginated listing page.
    """
    global quotes
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0'}
    # NOTE(review): proxy is hardcoded to a local instance on port 3128 —
    # confirm this is still the intended environment.
    # An explicit timeout prevents a worker thread from hanging forever on a
    # stalled connection (requests has no default timeout).
    r = requests.get(
        url,
        headers=headers,
        proxies={'http': "http://127.0.0.1:3128", 'https': "http://127.0.0.1:3128"},
        timeout=30,
    )
    soup = BeautifulSoup(r.content, 'lxml')
    table = soup.find('div', attrs={'id': 'all_quotes'})
    if table is None:
        # Error page or changed layout: nothing to collect on this page.
        return
    for quoteItem in table.findAll('div', attrs={'class': 'col-6 col-lg-3 text-center margin-30px-bottom '
                                                          'sm-margin-30px-top'}):
        quote = {'theme': quoteItem.h5.string}
        if quoteItem.img:
            # alt text has the form "<quote text> #<author> ...":
            # split once instead of twice as before.
            parts = quoteItem.img['alt'].split(" #")
            quote['lines'] = parts[0]
            quote['author'] = parts[1]
        else:
            # Text-only card: quote body in <p>, author in <small>.
            quote['lines'] = quoteItem.p.string
            quote['author'] = quoteItem.small.string
        quotes.append(quote)
if __name__ == '__main__':
    # The listing is paginated 1..123.
    urlbase = "https://www.passiton.com/inspirational-quotes?page="
    urls = [urlbase + str(i) for i in range(1, 124)]
    quotes = []  # filled by scrapePage() workers; appends are GIL-atomic
    # stdlib ThreadPoolExecutor replaces the unmaintained `threadpool`
    # package; scraping is I/O-bound, so 10 threads overlap the network waits.
    with ThreadPoolExecutor(max_workers=10) as pool:
        # Consume the map iterator so any exception raised inside a worker
        # propagates here instead of being silently dropped.
        for _ in pool.map(scrapePage, urls):
            pass
    filename = 'inspirational_quotes.csv'
    # newline='' is required by the csv module; utf-8 keeps non-ASCII
    # characters in quotes from being mangled.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        w = csv.DictWriter(f, ['theme', 'lines', 'author'])
        w.writeheader()
        w.writerows(quotes)
# Python Scraping 1
# (paste residue from the source article's page footer: "latest recommended
#  article published 2024-03-26 09:31:32")