from selenium import webdriver
from selenium.webdriver.common.by import By

# Scrape the titles of the "new albums" section on the Douban Music homepage.
driver = webdriver.Firefox()  # original misspelled this as "dirver"
try:
    driver.get('https://music.douban.com/')
    # find_elements_by_css_selector() was removed in Selenium 4;
    # the By-based locator API is the supported replacement.
    for album in driver.find_elements(By.CSS_SELECTOR, '.new-albums .album-title'):
        print(album.text)
finally:
    driver.quit()  # always release the browser process, even on error
import requests
from lxml import html

# Log into Bitbucket with a persistent requests session, then scrape the
# repository names shown on the dashboard overview page.

# 创建 session 对象。这个对象会保存所有的登录会话请求。
# (The Session persists cookies across every request in the login flow.)
session_requests = requests.session()

# Fetch the login page and extract the CSRF token the form requires.
login_url = "https://bitbucket.org/account/signin/?next=/"
result = session_requests.get(login_url)
tree = html.fromstring(result.text)
# xpath() already returns a list; index it directly instead of the original
# list(set(...))[0], whose ordering was nondeterministic.
authenticity_token = tree.xpath("//input[@name='csrfmiddlewaretoken']/@value")[0]

payload = {
    "username": "<你的用户名>",
    "password": "<你的密码>",
    # 在源代码中,有一个名为 "csrfmiddlewaretoken" 的隐藏输入标签。
    "csrfmiddlewaretoken": authenticity_token,
}

# Perform the login; the Referer header is needed for the CSRF check.
result = session_requests.post(
    login_url,
    data=payload,
    headers=dict(referer=login_url),
)

# Authenticated now — fetch the dashboard and pull the repo names.
url = 'https://bitbucket.org/dashboard/overview'
result = session_requests.get(
    url,
    headers=dict(referer=url),
)

tree = html.fromstring(result.content)
# The original path had a stray trailing "/" (selecting children of the span).
bucket_elems = tree.findall(".//span[@class='repo-name']")
# Two fixes: text_content is a METHOD and must be called (the original
# accessed the bound method and crashed on .replace), and the character to
# strip is the newline "\n", not the letter "n".
bucket_names = [b.text_content().replace("\n", "").strip() for b in bucket_elems]
print(bucket_names)
from bs4 import BeautifulSoup
import requests


class CSDN(object):
    """Log into CSDN's passport service and fetch the user's blog post list."""

    def __init__(self, headers):
        # One Session object carries cookies through the whole login flow.
        self.session = requests.Session()
        self.headers = headers

    def get_webflow(self):
        """Return the hidden (lt, execution) flow tokens from the login form."""
        url = 'http://passport.csdn.net/account/login'
        resp = self.session.get(url=url, headers=self.headers)
        page = BeautifulSoup(resp.text, 'html.parser')
        tokens = (page.find('input', {'name': 'lt'})['value'],
                  page.find('input', {'name': 'execution'})['value'])
        page.clear()
        return tokens

    def login(self, account, password):
        """Submit the login form using freshly scraped flow tokens."""
        self.username = account
        self.password = password
        lt, execution = self.get_webflow()
        form = {
            'username': account,
            'password': password,
            'lt': lt,
            'execution': execution,
            '_eventId': 'submit',
        }
        url = 'http://passport.csdn.net/account/login'
        resp = self.session.post(url=url, headers=self.headers, data=form)
        # NOTE(review): a 200 status only proves the POST round-tripped; it
        # does not confirm the credentials were actually accepted.
        print('正常' if resp.status_code == 200 else '异常')

    def func(self):
        """Fetch the authenticated post-list page and dump its HTML."""
        headers1 = {
            'Host': 'write.blog.csdn.net',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        }
        resp = self.session.get(url='http://write.blog.csdn.net/postlist',
                                headers=headers1,
                                allow_redirects=False)
        print(resp.text)


if __name__ == '__main__':
    headers = {
        'Host': 'passport.csdn.net',
        'Origin': 'http://passport.csdn.net',
        'Referer': 'http://passport.csdn.net/account/login',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
    }
    csdn = CSDN(headers=headers)
    account = ''
    password = ''
    csdn.login(account=account, password=password)
    csdn.func()
#coding=utf-8
import requests
import re
import time
import json
from bs4 import BeautifulSoup as BS
import sys

# Crawl Mtime's top-100 movie list and print each movie's ratings (and box
# office, when present) from the site's JSONP rating service.

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
}


def Get_Movie_URL():
    """Collect the detail-page URLs of all movies in the top-100 list."""
    urls = []
    for i in range(1, 11):
        # 第一页的URL是不一样的,需要另外进行处理 (page 1 has no index suffix)
        if i != 1:
            url = "http://www.mtime.com/top/movie/top100/index-%d.html" % i
        else:
            url = "http://www.mtime.com/top/movie/top100/"
        r = requests.get(url=url, headers=headers)
        soup = BS(r.text, 'lxml')
        # Raw string fixes the invalid "\d" escape; the dot is escaped so it
        # matches only the literal hostname.
        movies = soup.find_all(
            name='a',
            attrs={'target': '_blank',
                   'href': re.compile(r'http://movie\.mtime\.com/(\d+)/'),
                   'class': not None})
        for m in movies:
            urls.append(m.get('href'))
    return urls


def Create_Ajax_URL(url):
    """Build the JSONP rating-service URL for one movie detail-page URL."""
    movie_id = url.split('/')[-2]
    # The service expects a timestamp token; "0368" mimics the site's format.
    t = time.strftime("%Y%m%d%H%M%S0368", time.localtime())
    ajax_url = "http://service.library.mtime.com/Movie.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Library.Services&Ajax_CallBackMethod=GetMovieOverviewRating&Ajax_CrossDomain=1&Ajax_RequestUrl=%s&t=%s&Ajax_CallBackArgument0=%s" % (url, t, movie_id)
    return ajax_url


def Crawl(ajax_url):
    """Fetch one movie's JSONP payload and print a formatted rating summary."""
    r = requests.get(url=ajax_url, headers=headers)
    if r.status_code != 200:
        return
    r.encoding = 'utf-8'
    # The payload looks like "var result = {...};" — extract the JSON part.
    # The original indexed [0] before checking, so an empty match list raised
    # IndexError (the likely cause of the intermittent failures noted below).
    matches = re.findall(r'=(.*?);', r.text)
    if not matches:
        return
    value = json.loads(matches[0])

    movieTitle = value.get('value').get('movieTitle')
    TopListName = value.get('value').get('topList').get('TopListName')
    Ranking = value.get('value').get('topList').get('Ranking')
    movieRating = value.get('value').get('movieRating')
    RatingFinal = movieRating.get('RatingFinal')
    RDirectorFinal = movieRating.get('RDirectorFinal')
    ROtherFinal = movieRating.get('ROtherFinal')
    RPictureFinal = movieRating.get('RPictureFinal')
    RStoryFinal = movieRating.get('RStoryFinal')
    print(movieTitle)
    # Box-office data only exists for some movies.
    if value.get('value').get('boxOffice'):
        TotalBoxOffice = value.get('value').get('boxOffice').get('TotalBoxOffice')
        TotalBoxOfficeUnit = value.get('value').get('boxOffice').get('TotalBoxOfficeUnit')
        print('票房:%s%s' % (TotalBoxOffice, TotalBoxOfficeUnit))
    print('%s——No.%s' % (TopListName, Ranking))
    print('综合评分:%s 导演评分:%s 画面评分:%s 故事评分:%s 音乐评分:%s' % (RatingFinal, RDirectorFinal, RPictureFinal, RStoryFinal, ROtherFinal))
    print('****' * 20)


def main():
    urls = Get_Movie_URL()
    for u in urls:
        Crawl(Create_Ajax_URL(u))

    # 问题所在,请求如下单个电影链接时时不时会爬取不到数据
    # Crawl(Create_Ajax_URL('http://movie.mtime.com/98604/'))


if __name__ == '__main__':
    main()
相关工具
链接: https://pan.baidu.com/s/1oEw_MsaAWcMx7NQII6jXYg 密码: e6b6
链接: https://pan.baidu.com/s/1fSppM-hK2x9Jk9RGqvRMqg 密码: 4q43