import requests import re from urllib.parse import urlencode from requests.exceptions import RequestException import json from bs4 import BeautifulSoup import codecs from conm import *
# HTTP headers sent with every request; the User-Agent is a desktop
# Chrome browser string.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
}


def get_page_index(offset, keyword, *, timeout=10):
    """Fetch one page of the search index and return the raw response text.

    Args:
        offset: Value sent as the ``offset`` query parameter
            (presumably the index of the first result -- confirm with caller).
        keyword: Search keyword sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled connection
            would hang the whole run.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request fails
        (including a timeout).
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
    }
    # Build the request URL from the query parameters.
    url = 'http://www.toutiao.com/search_content/?' + urlencode(data)
    # Network errors (connection failures, timeouts, ...) are reported and
    # turned into a None result instead of propagating to the caller.
    try:
        response = requests.get(url, headers=header, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求索引页出错')
        return None
def parse_page_index(html): data = json.loads(html) if data and 'data