# Import modules.
from lxml import etree
import time
from selenium import webdriver
# The scraper is organised as a class (object-oriented structure).
class WeiBo(object):
    """Selenium-driven scraper that saves the comments shown under one Weibo post.

    Workflow (see :meth:`run`): wait for the operator to log in, open the
    post, repeatedly scroll to the bottom and click the element matched by
    the ``more_txt`` XPath so further comments are loaded, then extract the
    comment text from the rendered page and append it to a UTF-8 text file.
    """

    # Location of the chromedriver binary handed to Selenium.
    driver_path = r'E:\chromedriver.exe'
    # Not referenced by any method of this class; kept so external code that
    # reads the attribute keeps working.
    csv_name = 'bgsxy_allweibo.csv'

    # JavaScript snippet that scrolls the window to the bottom of the page.
    _SCROLL_TO_BOTTOM = "window.scrollTo(0, document.body.scrollHeight);"
    # XPath of the clickable element used to request more comments.
    _MORE_XPATH = "//div[@class='list_box']//span[@class='more_txt']"
    # XPath of the text nodes that hold the comment bodies.
    _COMMENT_XPATH = "//div[@class='list_box']//div[@class='WB_text']/text()"

    def __init__(self):
        # NOTE(review): `executable_path` is the Selenium 3 keyword; newer
        # Selenium 4 releases expect a Service object instead -- confirm
        # against the installed Selenium version before upgrading.
        self.driver = webdriver.Chrome(executable_path=WeiBo.driver_path)
        self.url = 'https://weibo.com/2803301701/ItRmzFAPJ'
        # Initialised empty; no method of this class fills it.
        self.comments = []

    @staticmethod
    def _append_comments(comments, path):
        """Append *comments* to *path* as UTF-8, one newline after each entry.

        Every entry is newline-terminated (rather than newline-joined) so
        that repeated appends never glue the last entry of one batch to the
        first entry of the next. The file is created if it does not exist,
        even when *comments* is empty.
        """
        payload = ''.join(comment + '\n' for comment in comments)
        # Binary append keeps the bytes identical on every platform; the
        # context manager closes the handle even if the write fails.
        with open(path, 'ab') as fo:
            fo.write(payload.encode('utf-8'))

    def parse_detail_page(self, source, output_path='1.txt'):
        """Extract comment text from page HTML and append it to a text file.

        Args:
            source: HTML markup of the rendered post page.
            output_path: File the comments are appended to (UTF-8).

        Returns:
            The list of extracted comment strings, in document order.
        """
        html = etree.HTML(source)
        comment_list = list(html.xpath(self._COMMENT_XPATH))
        print(comment_list)
        print('开始写入txt文件')
        self._append_comments(comment_list, output_path)
        return comment_list

    def run(self, max_clicks=40):
        """Drive the browser through the post and save the loaded comments.

        Blocks on ``input`` until the operator confirms they have logged in,
        then loads ``self.url`` and clicks the "more" element up to
        *max_clicks* times, scrolling to the bottom before each click.
        The fixed sleeps give the page time to load between steps.

        Args:
            max_clicks: Upper bound on the number of "more" clicks.

        Returns:
            The list of comment strings returned by :meth:`parse_detail_page`.
        """
        # Local import: the file only imports `webdriver` at module level.
        from selenium.common.exceptions import NoSuchElementException

        input('请在chrome中登录weibo.com')
        self.driver.get(self.url)
        time.sleep(6)
        self.driver.execute_script(self._SCROLL_TO_BOTTOM)
        time.sleep(6)
        for i in range(max_clicks):
            time.sleep(5)
            print('第%d次点击' % i)
            self.driver.execute_script(self._SCROLL_TO_BOTTOM)
            time.sleep(6)
            try:
                # find_element('xpath', ...) is accepted by both Selenium 3
                # and Selenium 4, unlike find_element_by_xpath.
                next_page = self.driver.find_element('xpath', self._MORE_XPATH)
            except NoSuchElementException:
                # Nothing left to click: stop paging and keep what has
                # already been loaded instead of aborting the whole run.
                break
            time.sleep(5)
            next_page.click()
            time.sleep(5)
        source = self.driver.page_source
        return self.parse_detail_page(source)
def main():
    """Create the scraper and run it once."""
    crawler = WeiBo()
    crawler.run()


# Only start the scraper when the file is executed as a script.
if __name__ == '__main__':
    main()
# There is still a lot of room for improvement in this code; readers' criticism and suggestions are welcome.