"""Scrape street-snap ("街拍") image galleries from Toutiao search results.

Workflow:
    1. Request the ``search_content`` JSON endpoint for a keyword.
    2. Read each result's ``article_url``.
    3. Fetch the article page and pull the gallery JSON embedded in its
       inline script.
    4. Download every gallery image into the current working directory,
       using the MD5 digest of the image bytes as the file name.
"""
import json
import os
import re
from hashlib import md5
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

# Seconds to wait on any single HTTP request; without a timeout a stalled
# connection would block the whole crawl forever.
_REQUEST_TIMEOUT = 10

# Captures the escaped JSON text handed to ``JSON.parse("...")`` for the
# ``gallery`` field of the page's inline script (between ``mediaInfo`` and
# ``siblingList``).  ``re.S`` lets ``.`` span line breaks.
_GALLERY_PATTERN = re.compile(
    r'mediaInfo:.*?gallery: JSON.parse.*?"(.*)".*?siblingList', re.S)

# NOTE: an earlier approach built one page URL per picture in the form
# ``<article_url>#p=<n>`` (n starting at 1).  It was abandoned because those
# are page addresses, not the image source addresses, so nothing could be
# downloaded from them.


def get_page(url, data=None):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address to request.
        data: Optional mapping of query parameters, forwarded to
            ``requests.get`` as ``params``.  Leave it as ``None`` when
            ``url`` already carries its query string, otherwise every
            parameter is sent twice.

    Returns:
        The response text when the status code is 200; ``None`` for any
        other status or when the request raises ``RequestException``.
    """
    try:
        response = requests.get(url, params=data, timeout=_REQUEST_TIMEOUT)
    except RequestException as e:
        print(e)
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_page(html):
    """Yield the ``article_url`` of every entry in the search-result JSON.

    Args:
        html: JSON text returned by the search endpoint, or ``None``.

    Yields:
        Each non-empty ``article_url`` found under the top-level ``data``
        list.  Nothing is yielded when ``html`` is empty, is not valid JSON,
        or has no ``data`` entries.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return
    if not isinstance(data, dict):
        return
    for item in data.get('data') or []:
        article_url = item.get('article_url')
        if article_url:
            yield article_url


def get_page_detail(de_url):
    """Fetch the HTML of an article page so its gallery can be parsed.

    The article URL redirects to a new https address.  A ``HEAD`` request
    follows the redirect chain first; the resulting URL is then requested
    with redirects disabled so the final address can be read from the
    ``location`` response header and fetched.

    Args:
        de_url: Article URL taken from the search results.

    Returns:
        The page HTML on HTTP 200, otherwise ``None`` (also on any
        ``RequestException``).
    """
    # NOTE(review): 'Remote Address' and 'Referrer Policy' look like fields
    # copied from a browser's network inspector rather than real request
    # headers; they are kept unchanged here -- confirm whether they matter.
    headers = {
        'User-Agent': 'Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 63.0.3239.108Safari / 537.36',
        'Remote Address': '153.3.235.87:443',
        'Referrer Policy': 'no - referrer - when - downgrade'
    }
    try:
        # The context manager closes the session's connections when done.
        with requests.session() as session:
            redirection = session.head(
                de_url, allow_redirects=True, timeout=_REQUEST_TIMEOUT)
            # Redirects are disabled so the target stays in the headers.
            response = session.get(
                redirection.url, allow_redirects=False, headers=headers,
                timeout=_REQUEST_TIMEOUT)
            location = response.headers.get('location')
            if location:
                resp = session.get(location, timeout=_REQUEST_TIMEOUT)
            else:
                # No further redirect: this response already is the page.
                resp = response
            if resp.status_code == 200:
                return resp.text
            print("请求失败")
            return None
    except RequestException as e:
        print(e)
        return None


def parse_page_detail(html, de_url):
    """Extract the gallery image URLs from an article page and download them.

    The page embeds the gallery as a JavaScript string literal passed to
    ``JSON.parse``, so every ``"`` arrives as ``\\"`` and every ``/`` is
    preceded by backslashes.  ``json.loads`` rejects the text in that form
    (keys must be wrapped in plain double quotes), so the backslash before
    each quote and one backslash before each slash are stripped first.

    Args:
        html: Article page HTML, or ``None``.
        de_url: URL the HTML was fetched from; echoed back in the result.

    Returns:
        A dict with ``title``, ``de_url`` and ``images`` (list of image
        URLs), or ``None`` when the page holds no parsable gallery.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''

    result = _GALLERY_PATTERN.search(html)
    if not result:
        return None
    # Order matters: un-escape the quotes first, then the slashes.
    unescaped = result.group(1).replace('\\"', '"').replace('\\/', '/')
    try:
        data = json.loads(unescaped)
    except ValueError as e:
        print(e)
        return None
    if not data or 'sub_images' not in data:
        return None

    sub_images = data.get('sub_images') or []
    images = [item.get('url') for item in sub_images if item.get('url')]
    for image in images:
        download_images(image)
    return {
        "title": title,
        'de_url': de_url,
        'images': images
    }


def download_images(url):
    """Download one image and store it via ``save_images``.

    Args:
        url: Address of the image.

    Returns:
        Always ``None``; failed requests and non-200 responses are skipped.
    """
    print('正在下载...', url)
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        save_images(response.content)
    return None


def save_images(content):
    """Write image bytes to ``<cwd>/<md5 of content>.jpg``.

    Naming the file after the content digest means identical images map to
    the same path; an existing file is left untouched.

    Args:
        content: Raw image bytes.
    """
    # os.getcwd() is the current working directory.
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(os.getcwd(), file_name)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
        print('保存成功...')


def main(offset, keyword):
    """Crawl one page of gallery search results and download all images.

    Args:
        offset: Result offset passed to the search endpoint.
        keyword: Search keyword.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'from': 'gallery'
    }
    # urlencode turns the dict into a query string; the parameters are
    # already in the URL, so they are not passed to get_page a second time.
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    html = get_page(url)
    for de_url in parse_page(html):
        text = get_page_detail(de_url)
        if text:
            parse_page_detail(text, de_url)


if __name__ == '__main__':
    main(0, '街拍')
python爬虫:分析Ajax请求爬取今日头条街拍图
最新推荐文章于 2020-04-09 12:41:49 发布