Python3爬取One

最近发现了一个叫 One 的 app,觉得挺有趣的,所以就有了想把里面的内容摘取下来的心思。身为一个程序员,而且是学 Python 的,那就爬呗!
话不多说:直接上代码!

from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import requests, xlwt


class One(object):
    """Scrape entries from wufazhuce.com ("One") into an .xls worksheet.

    Intended call order: ``open_file()`` -> ``get_urls()`` -> ``close_file()``.
    """

    sheet = None     # xlwt worksheet, created by open_file()
    workbook = None  # xlwt workbook, created by open_file()
    row = 1          # next data row to write; row 0 holds the column headers
    _ua = None       # UserAgent instance, created lazily by get_headers()

    def __init__(self):
        self.root_url = 'http://wufazhuce.com'

    def get_headers(self):
        """Return the request headers, with a randomly picked User-Agent."""
        # Reuse one UserAgent object instead of rebuilding it per request.
        if self._ua is None:
            self._ua = UserAgent()
        return {
            'User-Agent': self._ua.random,
            'Cookie': 'RELEASE_KEY=; XIN_anti_uid=1019A720-5462-8AFF-7ACB-21B849C8B377; XIN'
        }

    def get_urls(self, start=20, stop=300):
        """Fetch and record every page ``/one/<n>`` for n in [start, stop).

        The defaults reproduce the original fixed range of 20..299.
        """
        for page_id in range(start, stop):
            self.get_data(self.root_url + '/one/' + str(page_id))

    def get_data(self, url):
        """Download ``url`` and hand the HTML to ``page_urls``.

        Failures are reported on stdout and the page is skipped, so one bad
        page never aborts the whole crawl. Always returns None.
        """
        try:
            # headers must be passed by keyword: the second positional
            # parameter of requests.get is ``params`` (the query string).
            # The timeout keeps one stalled connection from hanging the run.
            response = requests.get(url, headers=self.get_headers(), timeout=10)
        except Exception as e:
            print('请求页面异常:', e)
            return None

        if response.status_code != 200:
            print('请求页面状态码:', response.status_code)
            return None

        # Report parse/write problems separately from request problems.
        try:
            self.page_urls(response.text)
        except Exception as e:
            print('解析页面异常:', e)
        return None

    @staticmethod
    def _last_text(nodes):
        """Return the stripped text of the last node, or '' if there is none."""
        return nodes[-1].text.strip() if nodes else ''

    def page_urls(self, html):
        """Extract title, image URL, date and text from ``html`` and store them.

        For each selector the last match is used. A field with no match is
        stored as an empty string; a page with no match at all is skipped.
        Does nothing when ``html`` is empty or None.
        """
        if not html:
            return
        bs = BeautifulSoup(html, "lxml")
        title = self._last_text(bs.select('.one-titulo'))
        images = bs.select('#main-container .one-imagen img')
        img_url = (images[-1].get('src') or '') if images else ''
        content = self._last_text(bs.select('.one-cita-wrapper .one-cita'))
        pub_date = self._last_text(bs.select('.one-cita-wrapper .one-pubdate'))

        if not (title or img_url or content or pub_date):
            print('页面未找到任何内容, 已跳过')
            return

        print(title)
        print(img_url)
        print(content)
        print(pub_date)
        print('++++++++++++++++++++++++++++++++++')
        self.sheet.write(self.row, 0, title)
        self.sheet.write(self.row, 1, img_url)
        self.sheet.write(self.row, 2, pub_date)
        self.sheet.write(self.row, 3, content)
        self.row += 1

    def open_file(self):
        """Create the workbook and the 'One' sheet, and write the header row."""
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workbook.add_sheet('One')
        self.sheet.write(0, 0, '编号')
        self.sheet.write(0, 1, '图片地址')
        self.sheet.write(0, 2, '日期')
        self.sheet.write(0, 3, '文章')
        # A fresh sheet starts right below the header, even on a reused instance.
        self.row = 1

    def close_file(self, filename='One.xls'):
        """Save the workbook to ``filename`` (default 'One.xls')."""
        self.workbook.save(filename)

if __name__ == '__main__':
    # Crawl the default page range and store the results in One.xls.
    one = One()
    one.open_file()
    try:
        one.get_urls()
    finally:
        # Save whatever was collected, even if the crawl is interrupted
        # (e.g. Ctrl-C) part-way through.
        one.close_file()
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值