"""Scraper for the "One" (wufazhuce.com) app website.

Original author's note (translated): "Recently found the One app quite
interesting and wanted to grab its content — and as a Python programmer,
of course I scraped it. Without further ado, here's the code."

Fetches article pages /one/20 .. /one/299, extracts title, image URL,
publish date and quote text with BeautifulSoup, and saves the results
to an Excel sheet via xlwt.
"""
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import requests
import xlwt


class One(object):
    """Crawl wufazhuce.com article pages and dump them to One.xls."""

    def __init__(self):
        self.root_url = 'http://wufazhuce.com'
        # Worksheet state, created by open_file(). Instance attributes
        # instead of shared class attributes so each crawler is independent.
        self.sheet = None
        self.workbook = None
        self.row = 1  # next sheet row to write; row 0 holds the header

    def get_headers(self):
        """Build request headers with a randomized User-Agent."""
        ua = UserAgent()
        headers = {
            'User-Agent': ua.random,
            'Cookie': 'RELEASE_KEY=; XIN_anti_uid=1019A720-5462-8AFF-7ACB-21B849C8B377; XIN'
        }
        return headers

    def get_urls(self):
        """Iterate over article ids 20..299 and fetch each page."""
        for x in range(20, 300):
            url = self.root_url + '/one/' + str(x)
            self.get_data(url)

    def get_data(self, url):
        """Fetch one article page and hand the HTML to the parser.

        BUG FIX: the original called requests.get(url, self.get_headers()),
        which binds the headers dict to the second positional parameter,
        ``params`` (the query string) — the User-Agent/Cookie were never
        sent as HTTP headers. Pass them via the ``headers=`` keyword.
        A timeout is added so one stalled connection cannot hang the crawl.

        Returns None on a non-200 status or request exception (logged).
        """
        try:
            response = requests.get(url, headers=self.get_headers(), timeout=10)
            if response.status_code == 200:
                self.page_urls(response.text)
            else:
                print('请求页面状态码:', response.status_code)
                return None
        except Exception as e:
            print('请求页面异常:', e)
            return None

    def page_urls(self, html):
        """Parse one article page and append its data to the sheet.

        BUG FIX: the original ran four independent for-loops and then
        wrote whatever values were left in the loop variables — so only
        the *last* match of each selector was saved, and a selector with
        no matches raised NameError on the unbound loop variable.
        zip() pairs the parallel result lists and writes one complete
        row per matched group; pages missing any field are skipped safely.
        """
        if not html:
            return
        bs = BeautifulSoup(html, "lxml")
        img_urls = bs.select('#main-container .one-imagen img')
        titles = bs.select('.one-titulo')
        contents = bs.select('.one-cita-wrapper .one-cita')
        times = bs.select('.one-cita-wrapper .one-pubdate')
        for title, img, quote, pubdate in zip(titles, img_urls, contents, times):
            title1 = title.text.strip()
            url = img.get('src')  # image address obtained
            content = quote.text.strip()
            time = pubdate.text.strip()
            print(title1)
            print(url)
            print(content)
            print(time)
            print('++++++++++++++++++++++++++++++++++')
            self.sheet.write(self.row, 0, title1)
            self.sheet.write(self.row, 1, url)
            self.sheet.write(self.row, 2, time)
            self.sheet.write(self.row, 3, content)
            self.row += 1

    def open_file(self):
        """Create the workbook/worksheet and write the header row."""
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workbook.add_sheet('One')
        self.sheet.write(0, 0, '编号')
        self.sheet.write(0, 1, '图片地址')
        self.sheet.write(0, 2, '日期')
        self.sheet.write(0, 3, '文章')

    def close_file(self):
        """Save the workbook to One.xls in the working directory."""
        self.workbook.save('One.xls')


if __name__ == '__main__':
    one = One()
    one.open_file()
    one.get_urls()
    one.close_file()
Python3爬取One
最新推荐文章于 2023-04-25 15:01:05 发布