import requests
from lxml import etree
import re
from bs4 import BeautifulSoup as bs
class Spider:
    """Scrape thread listings from a Discuz-style forum into a text file.

    Each scraped thread is appended to ``<path>/discuz.txt`` as one line of
    the form ``thread_id,title,user_id``.
    """

    # Patterns used by get_id, keyed by its ``model`` argument.  Group 2 of
    # each pattern is the numeric id.
    # NOTE(review): model 2 is applied to the author link; it is assumed to
    # look like "home.php?mod=space&uid=N" -- confirm against the live site.
    _ID_PATTERNS = {
        1: r'(,*)tid=([0-9]*)(.*)',
        2: r'(,*)uid=([0-9]*)(.*)',
    }

    def __init__(self, path):
        """Remember the output directory *path* and the output file name."""
        self.path = path
        self.file = "discuz.txt"

    def get_data(self, url):
        """Fetch one listing page and append a line per thread to the output file.

        Pages that could not be fetched, pages without the thread table and
        rows without a title or author link are skipped instead of raising.
        """
        html = self.get_page(url)
        if html is None:
            # get_page already printed the error.
            return

        # NOTE(review): the id was spelled 'threadlisttableis' before;
        # 'threadlisttableid' is what stock Discuz templates use -- confirm.
        table = html.find('table', attrs={'id': 'threadlisttableid'})
        if table is None:
            return

        out_path = "{}/{}".format(self.path, self.file)
        # The first <tbody> is skipped, as in the original slice [1:]
        # (presumably a header/separator row -- verify against the page).
        for tb in table.find_all('tbody')[1:]:
            title = tb.find('a', class_='s xst')
            author_cell = tb.find('td', class_='by')
            author = author_cell.find('a') if author_cell is not None else None
            if title is None or author is None:
                continue
            line = "{1},{0},{2}\n".format(
                title.text,
                self.get_id(title['href']),
                self.get_id(author['href'], 2),
            )
            self.write2file(out_path, line)

    def get_page(self, url):
        """Download *url* and return it parsed by BeautifulSoup.

        Returns ``None`` (after printing the error) when the request fails.
        """
        header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
        try:
            page = requests.get(url, headers=header, timeout=3)
        except requests.RequestException as e:
            print(e)
            return None
        return bs(page.text, 'lxml')

    def get_id(self, string, model=1):
        """Extract a numeric id from a link.

        Args:
            string: The href to search.
            model: ``1`` looks for ``tid=<digits>``, ``2`` for ``uid=<digits>``.

        Returns:
            The digits as a string, or ``None`` when *model* is unknown or
            the pattern does not occur in *string*.
        """
        reg = self._ID_PATTERNS.get(model)
        if reg is None:
            return None
        post_data = re.search(reg, string)
        if post_data is None:
            return None
        return post_data.group(2)

    def write2file(self, file, content):
        """Append *content* to *file* (UTF-8); do nothing when it is ``None``."""
        if content is None:
            return
        with open(file, 'a', encoding='utf-8') as f:
            f.write(content)

    def run(self):
        """Scrape listing pages 1 through 293 of the forum."""
        # NOTE(review): 'fid=Z' is not a numeric forum id; it is kept verbatim
        # because the intended value cannot be determined here -- confirm.
        url = 'http://39.103.207.129/forum.php?mod=forumdisplay&fid=Z&page={}'
        for i in range(1, 294):
            self.get_data(url.format(i))


if __name__ == '__main__':
    spider = Spider("/home/kim/tmp/bigdata")
    spider.run()
    print("End")
import requests
from lxml import etree
import re
def get_page(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script.

    Returns:
        The page text, or ``None`` (after printing the error) when the
        request fails or the server answers with an error status.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException as e:
        print(e)
        return None
    r.encoding = 'utf8'
    return r.text


def parse(page):
    """Extract ``"user,rating星"`` strings from a review-list page.

    Args:
        page: HTML text of the page; ``None`` or an empty string yields
            an empty list.

    Returns:
        One string per review header that has both a user name
        (``a[2]`` text) and a number in the class of its first ``span``.
        The number is divided by 10 to form the rating.  Headers missing
        either piece are skipped.
    """
    if not page:
        return []
    dom = etree.HTML(page)
    if dom is None:
        return []

    result = []
    # Read user and rating from the same header so the two always belong
    # to the same review, even when some header lacks one of them.
    for header in dom.xpath('//div[@class="review-list "]/div/div/header'):
        names = header.xpath('a[2]/text()')
        classes = header.xpath('span[1]/@class')
        if not names or not classes:
            continue
        # presumably the class looks like "allstar50 ..." -- verify on the page
        digits = re.findall('[0-9]+', classes[0])
        if not digits:
            continue
        star = str(int(digits[0]) / 10) + '星'
        result.append(str(names[0]) + ',' + star)
    return result
def save(result, path='test.txt'):
    """Append every string in *result* to *path*, one per line.

    Args:
        result: Iterable of strings to write.
        path: Destination file, opened in append mode as UTF-8.
    """
    with open(path, 'a', encoding='utf-8') as out:
        out.writelines(item + '\n' for item in result)


if __name__ == '__main__':
    url = r'https://movie.douban.com/subject/26266893/reviews?start='
    page = get_page(url)
    # get_page prints the error and yields None on failure; only parse and
    # report success when a page was actually downloaded.
    if page is not None:
        result = parse(page)
        save(result)
        print('success')