How to Crawl a Forum Site
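This post collects two small crawler scripts: a Spider class that scrapes the thread list of a Discuz forum with requests and BeautifulSoup, and a second script that scrapes reviewer names and star ratings from a Douban movie review page with lxml and XPath.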

import requests
import re
from bs4 import BeautifulSoup as bs


class Spider():
    def __init__(self, path):
        self.path = path
        self.file = "discuz.txt"

    def get_data(self, url):
        # Parse one thread-list page and append one "tid,title,uid" line per thread.
        html = self.get_page(url)
        if html is None:
            return
        # 'threadlisttableid' is the thread-list table id on standard Discuz X templates.
        table = html.find('table', attrs={'id': 'threadlisttableid'})
        for tb in table.find_all('tbody')[1:]:
            title_link = tb.find('a', class_='s xst')
            title_href = title_link['href']
            title_text = title_link.text
            user_href = tb.find('td', class_='by').find('a')['href']
            line = "{1},{0},{2}\n".format(title_text, self.get_id(title_href), self.get_id(user_href, 2))
            self.write2file("{}/{}".format(self.path, self.file), line)

    def get_page(self, url):
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
        }
        try:
            page = requests.get(url, headers=header, timeout=3)
            return bs(page.text, 'lxml')
        except Exception as e:
            print(e)

    def get_id(self, string, model=1):
        # model=1 extracts the thread id (tid) from a thread link;
        # model=2 extracts the user id, assuming the member link carries a uid= parameter.
        if model == 1:
            reg = r'tid=([0-9]+)'
        else:
            reg = r'uid=([0-9]+)'
        post_data = re.search(reg, string)
        return post_data.group(1)

    def write2file(self, file, content):
        with open(file, 'a+', encoding='utf-8') as f:
            if content is not None:
                f.write(content)

    def run(self):
        # fid selects the target sub-forum; pages 1 through 293 are crawled.
        url = 'http://39.103.207.129/forum.php?mod=forumdisplay&fid=Z&page={}'
        for i in range(1, 294):
            self.get_data(url.format(i))


if __name__ == '__main__':
    spider = Spider("/home/kim/tmp/bigdata")
    spider.run()
    print("End")
            
import requests
from lxml import etree
import re


def get_page(url):
    # Fetch one review page and return its HTML text.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = 'utf8'
        return r.text
    except Exception as e:
        print(e)


def parse(page):
    # Extract review ids, reviewer names, and rating classes with XPath.
    dom = etree.HTML(page)
    id_ = dom.xpath('//div[@class="review-list  "]/div/@data-cid')
    user = dom.xpath('//div[@class="review-list  "]/div/div/header/a[2]/text()')
    star = dom.xpath('//div[@class="review-list  "]/div/div/header/span[1]/@class')
    # The rating is encoded in the class name, e.g. "allstar50" -> 5.0 stars.
    star_num = [re.findall('[0-9]+', i) for i in star]
    star_ = [str(int(i[0]) / 10) + '星' for i in star_num]
    result = []
    for i in range(len(user)):
        result.append(str(user[i]) + ',' + str(star_[i]))
    return result


def save(result):
    # Append one "user,stars" line per review.
    with open('test.txt', 'a', encoding='utf-8') as file:
        for i in result:
            file.write(i + '\n')


if __name__ == '__main__':
    url = r'https://movie.douban.com/subject/26266893/reviews?start='
    page = get_page(url)
    result = parse(page)
    save(result)
    print('success')
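
As written, the main block only fetches the first page, since an empty start parameter is treated as 0. Douban's review listing paginates through the start parameter, normally 20 reviews per page; a minimal loop sketch under that assumption (the 5-page limit is arbitrary):

base = r'https://movie.douban.com/subject/26266893/reviews?start={}'
for n in range(5):                          # first 5 pages, as an example
    page = get_page(base.format(n * 20))    # assumes 20 reviews per listing page
    if page:
        save(parse(page))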
