Recently, my lab advisor asked me to write a crawler to scrape some apps from the Anzhi market (安智市场). I'm recording the whole journey here, iteration by iteration, partly as a note to myself and partly to help anyone who needs it. I believe I'll write better code later (if I ever find time for a code review).
【Writing with urllib】
Simple, you could even say crude, but very easy for a beginner to understand.
It can crawl static pages, and you can reach more pages by changing the URL by hand (a sketch of automating that follows the listing below).
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 11 15:43:00 2018

@author: LeonardoWang
"""

import re
import urllib.request

url = "http://m.anzhi.com/top_1.html"
page = urllib.request.urlopen(url)
html = page.read().decode('utf-8')

# regex patterns for the download link and the app name
reg_down = r'href="(.*?)">极速下载</a>'
reg_name = r'<h4>(.*?)</h4>'

link = re.compile(reg_down)
linklist = re.findall(link, html)

name = re.compile(reg_name)
namelist = re.findall(name, html)

# single-file example:
# urllib.request.urlretrieve('http://m.anzhi.com/download.php?softid=3001793', '今日头条.apk')

item_link = []
for n in linklist:
    item_link.append("http://m.anzhi.com/" + n)

x = 0
for a in item_link:
    print(a)
    urllib.request.urlretrieve(a, 'D:\\apk\\%s.apk' % namelist[x])
    print("done")
    x += 1
print("Finished")
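As promised above, here is a minimal sketch of automating the "change the URL by hand" idea. It assumes the ranking pages follow the top_N.html pattern from the listing and that five such pages exist; both the pattern beyond top_1 and the page count are my assumptions, not something I verified against the site.

import re
import urllib.request

# assumed pattern: top_1.html .. top_5.html (page count is a guess)
for page_no in range(1, 6):
    url = "http://m.anzhi.com/top_%d.html" % page_no
    html = urllib.request.urlopen(url).read().decode('utf-8')
    links = re.findall(r'href="(.*?)">极速下载</a>', html)
    print("page %d: %d download links" % (page_no, len(links)))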
The above is a very basic demo, essentially a simple exercise with the urllib and re libraries. With that example under your belt, the next one is much easier to follow.
【Writing with Selenium】
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 26 17:14:27 2018

@author: LeonardoWang
"""

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from lxml import etree
import time
import re
import json


def GetDownLink():
    url = 'http://m.anzhi.com/top_1.html'
    browser = webdriver.Chrome()   # open the browser
    browser.get(url)               # load the list page
    html = browser.page_source
    try:
        # keep clicking "load more" until the page says 没有更多 ("no more")
        while html.find("没有更多") == -1:
            time.sleep(0.1)
            browser.find_element_by_xpath('//*[@id="az_more"]/a').send_keys(Keys.ENTER)
            html = browser.page_source
    except:
        # the "load more" button disappears once everything is loaded
        print('********************************')

    apk_crawl = browser.page_source   # page source with all items loaded

    reg_down = r'href="(.*?)">极速下载</a>'
    linklist = re.findall(re.compile(reg_down), apk_crawl)
    item_link = []
    for n in linklist:
        item_link.append("http://m.anzhi.com/" + n)

    reg_name = r'<h4>(.*?)</h4>'
    namelist = re.findall(re.compile(reg_name), apk_crawl)

    dic = {}
    for i in range(len(namelist)):
        dic.setdefault(namelist[i], []).append(item_link[i])
    print("Items crawled: %d" % len(namelist))
    browser.close()

    with open("D:\\DataTestPython\\Data\\apk.json", "w") as f:
        f.write(json.dumps(dic))
    return dic


def AnalysisUrl(Name_link):
    url = 'https://www.virustotal.com/#/home/url'

    # headless variant:
    # option = webdriver.ChromeOptions()
    # option.add_argument("headless")
    # browser = webdriver.Chrome(chrome_options=option)

    # run with a visible browser window
    browser = webdriver.Chrome()
    browser.get(url)
    time.sleep(2)
    search_box = browser.find_element_by_xpath(
        '//*[@id="urlScan"]/div[@id="omnibar"]/span[@id="searchBar"]/input')
    search_box.send_keys(Name_link)
    search_box.send_keys(Keys.ENTER)

    time.sleep(3)
    result_html = browser.page_source
    selector = etree.HTML(result_html)
    num = selector.xpath('//*[@id="pages"]/vt-result-url/div/vt-result-header/section/header/div[1]/h1//text()')
    data_num = "".join(num)

    # the scan can take a while; poll until the header text shows a result
    while len(data_num) < 4:
        time.sleep(3)
        result_html = browser.page_source
        selector = etree.HTML(result_html)
        num = selector.xpath('//*[@id="pages"]/vt-result-url/div/vt-result-header/section/header/div[1]/h1//text()')
        data_num = "".join(num)

    dic_analyRes = {}
    dic_analyRes.setdefault('Total', []).append(data_num)

    # per-engine detection results
    num_dete = selector.xpath('//*[@id="detections"]/div')
    for i in range(1, len(num_dete) + 1):
        obj = selector.xpath('//*[@id="detections"]/div[%s]/p[1]/span//text()' % i)
        data_obj = "".join(obj)
        rea = selector.xpath('//*[@id="detections"]/div[%s]/p[2]/span//text()' % i)
        data_rea = "".join(rea)
        dic_analyRes.setdefault(data_obj, []).append(data_rea)

    # switch to the "details" view by rewriting the tab name in the source
    time.sleep(1)
    result_detail = result_html.replace("detection", "details")
    selector = etree.HTML(result_detail)
    time.sleep(2)
    content = selector.xpath('//*[@id="content"]/vt-expandable-entry')
    for i in range(1, len(content)):
        detail_name = selector.xpath('//*[@id="content"]/vt-expandable-entry[%s]/section/h4//text()' % i)
        list_detail_name = "".join(detail_name)
        detail_detail = selector.xpath('//*[@id="content"]/vt-expandable-entry[%s]/section/div/text()' % i)
        list_detail_detail = "".join(detail_detail)
        dic_analyRes.setdefault(list_detail_name, []).append(list_detail_detail)

    # the sixth entry holds the header code block
    detail_header = selector.xpath('//*[@id="content"]/vt-expandable-entry[6]/section/h4//text()')
    list_detail_header = "".join(detail_header)
    detail_header_content = selector.xpath('//*[@id="content"]/vt-expandable-entry[6]/section/vt-code-block/pre/text()')
    list_detail_header_content = "".join(detail_header_content)
    dic_analyRes.setdefault(list_detail_header, []).append(list_detail_header_content)

    browser.close()
    return dic_analyRes


if __name__ == "__main__":
    dic_Name_link = GetDownLink()
    Name = list(dic_Name_link.keys())
    Name_link = list(dic_Name_link.values())

    x = 0
    for link in Name_link:
        dic = AnalysisUrl(link)
        print(Name[x])
        print(dic)
        with open("D:\\DataTestPython\\Data\\%s.json" % Name[x], "w") as f:
            f.write(json.dumps(dic))
        x += 1
    print("Finish!")
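By the way, the headless option commented out in the script uses the older chrome_options keyword. Here is a minimal sketch of the headless variant with the newer spelling, assuming a reasonably recent Selenium and Chrome driver:

from selenium import webdriver

option = webdriver.ChromeOptions()
option.add_argument('--headless')           # no browser window
browser = webdriver.Chrome(options=option)  # newer keyword is `options`
browser.get('http://m.anzhi.com/top_1.html')
print(len(browser.page_source))             # prove we actually got the page
browser.quit()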
All in one go.
This covers both crawling the APK links from the page and automating the submission to the scanning site. That site is slow to search (especially when the sample isn't already in its database), so picking suitable sleep times really matters (see the explicit-wait sketch below).
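Instead of hand-tuned sleeps, Selenium's explicit waits can poll until an element actually appears. A minimal sketch, reusing the result-header XPath from the script above; the 60-second timeout is my guess, not something the site documents:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get('https://www.virustotal.com/#/home/url')
# block up to 60 s until the result header exists, instead of a fixed sleep
wait = WebDriverWait(browser, 60)
header = wait.until(EC.presence_of_element_located(
    (By.XPATH, '//*[@id="pages"]/vt-result-url/div/vt-result-header')))
print(header.text)
browser.quit()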
One more thing: when processing HTML, XPath is genuinely pleasant to use. Don't reach for regex here. I'm not saying regex isn't powerful; it's just that every tool has its specialty. A quick comparison follows.
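To make the point concrete, here is the same extraction done both ways on a snippet shaped like the Anzhi list items. The HTML string is fabricated for illustration; only the regex patterns and the li/div structure come from the scripts above.

import re
from lxml import etree

html = ('<html><body><li><div class="icon"></div><div>'
        '<h4>今日头条</h4><a href="detail.html">详情</a>'
        '<a href="download.php?softid=3001793">极速下载</a>'
        '</div></li></body></html>')

# regex: ties you to the exact characters around the target
names_re = re.findall(r'<h4>(.*?)</h4>', html)
links_re = re.findall(r'href="(.*?)">极速下载</a>', html)

# XPath: addresses the document structure instead
tree = etree.HTML(html)
names_xp = tree.xpath('//li/div[2]/h4/text()')
links_xp = tree.xpath('//li/div[2]/a[2]/@href')

print(names_re, links_re)
print(names_xp, links_xp)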
Finally, a programming idea I came across a while ago:
"The great way is simple" (大道若简).
To be honest, I can't translate it any better than that.
Don't worry about whether the code is elegant, don't worry about error handling; get it working first.
I've been reading Clean Code and Software Engineering lately (side rant: my software engineering final grade was painfully low; maybe I'm just not built for written exams, middle finger to that). Both talk about good code formatting, naming conventions, and layout.
Eat the meal one bite at a time; don't chase only the one-shot solution. For ordinary people like me, code review and refactoring matter far, far more.
Emm, code review and refactoring it is.
Talk is cheap; the code is shown below.
【Final】
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 15 11:44:33 2018
@author: LeoNardo
"""
import urllib.request
from lxml import etree
import json
if __name__ == "__main__":
    count = 0
    saveFile = r"C:\Users\LeoNardo\Desktop\apk_bk_3.json"
    for page_no in range(1, 2):
        try:
            url = r"http://m.anzhi.com/recommend_1_%d.html" % page_no
            page = urllib.request.urlopen(url)
            htmlpage = page.read().decode('utf-8')
            selector = etree.HTML(htmlpage)
            content = selector.xpath('/html/body/li')
            # the inner loop gets its own variable so it does not shadow
            # the page counter
            for i in range(1, len(content) + 1):
                dic_analyRes = {}
                name = selector.xpath('/html/body/li[%s]/div[2]/h4/text()' % i)
                data_name = "".join(name)
                link = selector.xpath('/html/body//li[%s]/div[2]/a[2]/@href' % i)
                data_link = 'http://m.anzhi.com/' + ''.join(link)
                dic_analyRes[data_name] = data_link
                # append one JSON object per line
                with open(saveFile, "a") as f:
                    f.write(json.dumps(dic_analyRes) + '\n')
                count += 1
                print(count)
                print(dic_analyRes)
        except:
            continue
    print("Items crawled: %d" % count)