# 自学了一些 Python 知识,参考网上别人的例子,成功爬取了 Oracle 大佬博客的文章(希望大佬不要打我,我爬取到本地是为了学习用)。
# coding: utf-8
"""Download blog articles from www.xifenfei.com and store them as HTML files.

The script walks the paginated article listing, extracts every article link
with a regular expression and saves each article page in the current working
directory, in a file named after the sanitised article title.
"""
import re

# Listing-page URL template; ``%d`` is filled with the 1-based page number.
url = 'https://www.xifenfei.com/page/%d'

# Captures (href, title attribute, link text) of every article heading.
pattern = re.compile(
    r'<h2 class="entry-title"><a href="(.*?)" title="(.*?)" '
    r'rel="bookmark">(.*?)</a></h2>'
)

headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    ),
}

# Matches every character that is NOT in U+4E00-U+9FA5 (CJK ideographs), an
# ASCII letter or a digit.  The class is negated by its leading ``^`` only;
# additional carets inside the brackets would be literal characters and would
# let ``^`` slip into file names.
cop = re.compile(r'[^\u4e00-\u9fa5a-zA-Z0-9]')

# Seconds to wait for the server; without a timeout a stalled connection
# would block the crawl indefinitely.
REQUEST_TIMEOUT = 30


def _fetch_html(link):
    """Return the response body of *link*, decoded as UTF-8."""
    # Imported here rather than at module level so that the regex helpers
    # remain importable where the third-party package is not installed.
    import requests

    response = requests.get(url=link, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    return response.text


def write2filehtml(title, url):
    """Download the page at *url* and append it to ``<title>.html``.

    Args:
        title: Article title; every character matched by ``cop`` is removed
            before it is used as the file name.
        url: Address of the article page.

    Raises:
        requests.RequestException: If the download fails or times out.
        OSError: If the output file cannot be written.
    """
    finename = "%s.html" % cop.sub('', title)

    # Download first: a failed request must not leave an empty file behind.
    content = _fetch_html(url)

    # Give protocol-relative links an explicit scheme so they still resolve
    # when the saved page is opened from the local disk.
    content = content.replace('href="//www.xifenfei', 'href="http://www.xifenfei')
    content = content.replace('src="//www.xifenfei', 'src="http://www.xifenfei')

    with open(finename, 'a', encoding='utf-8') as fp:
        fp.write(content + '\n')


def write2file(items):
    """Record and download every article found on one listing page.

    Args:
        items: Iterable of ``(href, title_attribute, link_text)`` tuples as
            produced by ``pattern.findall``.

    Each article's link and link text are appended as one tab-separated line
    to ``./lhrbest_itpub_link_title1.txt``, then the article itself is saved
    via ``write2filehtml`` using the link text as its title.
    """
    with open('./lhrbest_itpub_link_title1.txt', 'a', encoding='utf-8') as fp:
        for link, _title_attr, text in items:
            fp.write('%s\t%s\n' % (link, text))
            write2filehtml(text, link)


def loadHtml(page):
    """Crawl listing pages ``1..page`` and save every article they link to.

    Args:
        page: Number of listing pages to process.  For values below 1 a
            message is printed and nothing is downloaded.
    """
    if page < 1:
        print('请输入数字!!!')
        return

    for p in range(1, page + 1):
        content = _fetch_html(url % p)
        write2file(pattern.findall(content))


if __name__ == '__main__':
    try:
        page = int(input('请输入需要爬取多少页:'))
    except ValueError:
        # Only a non-numeric answer is handled here; Ctrl-C and EOF propagate.
        print('请输入数字')
    else:
        loadHtml(page)