Scraping news sites with Python 3: news titles and content from Sina, NetEase, Toutiao, and UC...

The listing below is the Toutiao crawler: it signs requests for the news_society feed, pages through the feed using max_behot_time, and saves each article's Chinese text to a .txt file.

from urllib import request
import requests
import json
import time
import math
import hashlib
import re
from bs4 import BeautifulSoup


def get_url(max_behot_time, AS, CP):
    url = 'https://www.toutiao.com/api/pc/feed/?category=news_society&utm_source=toutiao&widen=1' \
          '&max_behot_time={0}' \
          '&max_behot_time_tmp={0}' \
          '&tadrequire=true' \
          '&as={1}' \
          '&cp={2}'.format(max_behot_time, AS, CP)
    return url
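As a quick sanity check of the URL builder, here is what it produces for the first page (max_behot_time 0), using the fallback signature constants from get_ASCP below; the output is line-wrapped here for readability:

print(get_url(0, '479BB4B7254C150', '7E0AC8874BB0985'))
# https://www.toutiao.com/api/pc/feed/?category=news_society&utm_source=toutiao
#   &widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true
#   &as=479BB4B7254C150&cp=7E0AC8874BB0985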

def get_ASCP():
    t = int(math.floor(time.time()))
    e = hex(t).upper()[2:]        # current Unix timestamp as uppercase hex
    m = hashlib.md5()
    m.update(str(t).encode(encoding='utf-8'))
    i = m.hexdigest().upper()     # uppercase MD5 of the same timestamp
    if len(e) != 8:
        # Fallback constants when the hex timestamp is not 8 digits long
        AS = '479BB4B7254C150'
        CP = '7E0AC8874BB0985'
        return AS, CP
    n = i[0:5]
    a = i[-5:]
    s = ''
    r = ''
    for o in range(5):
        # Interleave hash characters with timestamp characters
        s += n[o] + e[o]
        r += e[o + 3] + a[o]
    AS = 'A1' + s + e[-3:]
    CP = e[0:3] + r + 'E1'
    # print("AS:" + AS, "CP:" + CP)
    return AS, CP
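Both signatures come out to 15 characters, the same length as the fallback constants, which is an easy property to check:

AS, CP = get_ASCP()
print(AS, CP)            # values differ on every run (timestamp-derived)
print(len(AS), len(CP))  # 15 15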

def download(title, news_url):
    # print('crawling...')
    req = request.urlopen(news_url)
    if req.getcode() != 200:
        return 0
    res = req.read().decode('utf-8')
    # print(res)
    pat1 = r'content:(.*?),'               # article body embedded in inline JS
    pat2 = re.compile('[\u4e00-\u9fa5]+')  # runs of Chinese characters
    result1 = re.findall(pat1, res)
    # print(len(result1))
    if len(result1) == 0:
        return 0
    print(result1)
    result2 = re.findall(pat2, str(result1))
    result3 = []
    for i in result2:
        if i not in result3:  # de-duplicate while preserving order
            result3.append(i)
    # print(result2)
    # Strip characters that are illegal in Windows file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    with open(r'D:\code\python\spider_news\Toutiao_news\society\\' + title + '.txt', 'w') as file_object:
        file_object.write('\t\t\t\t')
        file_object.write(title)
        file_object.write('\n')
        file_object.write('Article URL: ')
        file_object.write(news_url)
        file_object.write('\n')
        for i in result3:
            # print(i)
            file_object.write(i)
            file_object.write('\n')
            # file_object.write(tag.get_text())
    # print('crawling...')
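A caveat on download: it recovers the article text by regex-matching content:(.*?), inside the page's inline JavaScript, which breaks whenever Toutiao changes its page template. BeautifulSoup is imported above but never used; a rough fallback sketch (assuming the body text sits in <p> tags, which the pages are not guaranteed to use) could be:

def extract_paragraphs(html):
    # Hypothetical fallback, not part of the original script: collect
    # non-empty <p> text; the tag choice is an assumption about the markup.
    soup = BeautifulSoup(html, 'html.parser')
    return [p.get_text(strip=True) for p in soup.find_all('p')
            if p.get_text(strip=True)]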

def get_item(url):
    # time.sleep(5)
    cookies = {'tt_webid': '6478612551432734221'}
    wbdata = requests.get(url, cookies=cookies)
    wbdata2 = json.loads(wbdata.text)
    data = wbdata2['data']
    for news in data:
        title = news['title']
        news_url = news['source_url']
        news_url = 'https://www.toutiao.com' + news_url
        print(title, news_url)
        if 'ad_label' in news:
            print(news['ad_label'])
            continue
        download(title, news_url)
    next_data = wbdata2['next']
    next_max_behot_time = next_data['max_behot_time']
    # print("next_max_behot_time:{0}".format(next_max_behot_time))
    return next_max_behot_time
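get_item assumes every request succeeds and returns well-formed JSON; in practice the feed endpoint can time out or return an empty body. A hedged sketch of a more defensive fetch (fetch_feed is a hypothetical helper, and the retry count and timeout are arbitrary choices):

def fetch_feed(url, cookies, retries=3):
    # Hypothetical helper, not in the original script: retry the feed
    # request a few times with a timeout before giving up.
    for attempt in range(retries):
        try:
            resp = requests.get(url, cookies=cookies, timeout=10)
            resp.raise_for_status()
            return resp.json()  # raises ValueError on malformed JSON
        except (requests.RequestException, ValueError):
            time.sleep(2)  # brief back-off before the next attempt
    return None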

if __name__ == '__main__':
    refresh = 50
    for x in range(0, refresh + 1):
        print('Round {0}:'.format(x))
        if x == 0:
            max_behot_time = 0  # 0 requests the first feed page
        else:
            max_behot_time = next_max_behot_time
            # print(next_max_behot_time)
        AS, CP = get_ASCP()
        url = get_url(max_behot_time, AS, CP)
        next_max_behot_time = get_item(url)
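The commented-out time.sleep(5) in get_item hints at throttling; fetching 51 feed pages back to back is a quick way to get the client blocked. A polite delay is a one-line addition at the end of each round:

        next_max_behot_time = get_item(url)
        time.sleep(5)  # pause between feed pages to avoid hammering the API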
