#!/usr/bin/env python
# coding=utf-8
# @Author: Holley
# @File: Baike1.py
# @Datetime: 4/12/2018 14:32
'''Description:
Fetch, from every page: user ID, gender, age, joke text, image,
comment ID, and the hottest comment.
'''
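# Pipeline (as implemented below): get_html fetches one listing page through
# the configured proxy, get_contents parses it into one record per joke, and
# write_csv appends the records to QiuShiBaiKe.csv.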
import csv
import os
import re

import requests
from bs4 import BeautifulSoup
from lxml import etree

class crawler(object):
    def __init__(self):
        self.base_URL = 'https://www.qiushibaike.com/hot/page/'
        self.start_URL = 'https://www.qiushibaike.com/hot/'
        self.CsvFileName = 'QiuShiBaiKe.csv'
        # self.Data_list = []
        self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) '
                                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'}
        self.proxy = 'username:password@host:port'
        self.proxies = {
            'http': 'http://' + self.proxy,
            'https': 'https://' + self.proxy,
        }
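        # NOTE: the proxy above is a placeholder ('username:password@host:port');
        # replace it with real credentials, or drop the proxies= argument in
        # get_html to connect directly.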

    # def get_urls(self, url):
    #     html = requests.get(url, proxies=self.proxies).text
    #     soup = BeautifulSoup(html, 'lxml')
    #     button = soup.find('ul', {'class': 'pagination'}).find_all('li')[-1]
    #     URL = button.find('a')['href']
    #     return self.base_URL + URL

    def get_html(self, url):
        # Send the custom User-Agent with the request; the site may reject
        # clients that use the default requests User-Agent
        html = requests.get(url, headers=self.headers, proxies=self.proxies).text
        return html

    def get_contents(self, html):
        selector = etree.HTML(html)
        Data_list = []
        parts = selector.xpath('//*[@id="content-left"]/div')
        # Split the page into one block per joke so each extracted field
        # stays matched to its own post
        for i in parts:
            # print(type(i))  # <class 'lxml.etree._Element'>
            # Serialize the block once, then re-parse it with BeautifulSoup
            # for simpler field lookups
            string_soup = str(etree.tostring(i), encoding="utf-8")
            soup = BeautifulSoup(string_soup, 'lxml')
            # print(soup)
            # Extract the user ID
            try:
                ID = soup.find('h2').string.strip()
                # Extract the gender from the class name,
                # e.g. "articleGender manIcon" -> "man"
                pattern = re.compile('</h2>.*?<div class="(.*?)">', re.S)
                gender = re.search(pattern, string_soup).group(1)
                Gender = gender.split('Gender ')[1].split('Icon')[0]
                # Extract the user's age
                Age = soup.find('div', {'class': gender}).string
            except AttributeError:
                # Anonymous users ("匿名用户") carry no gender/age markup
                ID, Gender, Age = '匿名用户', ' ', ' '
            # Extract the joke text
            joke_div = soup.find('div', {'class': 'content'})
            Joke = joke_div.find('span').getText().strip()
            # Extract the image URL, if the post has one
            jpg_div = soup.find('div', {'class': 'thumb'})
            Jpg = ''
            if jpg_div:
                # Drop the leading '//' of the protocol-relative URL
                Jpg = jpg_div.find('img')['src'][2:]
            # Extract the hottest comment, if any
            comment = soup.find('div', {'class': 'cmtMain'})
            if comment:
                # The hottest comment's author ID
                Comment_ID = comment.find('span', {'class': 'cmt-name'}).string.split(':')[0]
                # The hottest comment's text
                Comment = comment.find('div', {'class': 'main-text'}).getText().strip().split('\n')[0]
            else:
                Comment_ID, Comment = ' ', ' '
            Data = {
                'ID': ID,
                'Gender': Gender,
                'Age': Age,
                'Joke': Joke,
                'Jpg': Jpg,
                'Comment_ID': Comment_ID,
                'Comment': Comment
            }
            Data_list.append(Data)
        return Data_list
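    # Shape of one record returned by get_contents (values illustrative,
    # not real data):
    # {'ID': 'some_user', 'Gender': 'man', 'Age': '25', 'Joke': '...',
    #  'Jpg': 'pic.qiushibaike.com/...', 'Comment_ID': 'another_user',
    #  'Comment': '...'}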

    def write_csv(self, Data):
        # Write the header only once, when the file does not exist yet;
        # otherwise every append would repeat the header row
        write_header = not os.path.exists(self.CsvFileName)
        with open(self.CsvFileName, 'a', encoding='utf-8', newline='') as csvfile:
            fieldnames = ["ID", "Gender", "Age", "Joke", "Jpg", "Comment_ID", "Comment"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if write_header:
                writer.writeheader()
            # Append every record in Data to CsvFileName
            for items in Data:
                writer.writerow(items)
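    # A minimal sketch of reading the CSV back (assumes the layout written above):
    #   with open('QiuShiBaiKe.csv', encoding='utf-8', newline='') as f:
    #       for row in csv.DictReader(f):
    #           print(row['ID'], row['Joke'])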

if __name__ == '__main__':
    base_URL = 'https://www.qiushibaike.com/hot/page/'
    c = crawler()
    # Crawl hot pages 1-13 and append each page's records to the CSV
    for i in range(1, 14):
        Start_URL = base_URL + str(i) + '/'
        html = c.get_html(Start_URL)
        Data = c.get_contents(html)
        c.write_csv(Data)
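        # To avoid hammering the site, one could pause between pages, e.g.
        # time.sleep(1) (would need `import time`, which this script omits).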