Web scraping with urllib, requests, and lxml
Introduction
Both urllib and requests can fetch pages: passing a URL to urllib.urlopen(), requests.get(), or similar calls retrieves the page content. The etree module from lxml can then parse the resulting DOM tree with XPath expressions, and other libraries such as BeautifulSoup can be used to parse XML/HTML in much the same way as lxml.
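A minimal sketch of that fetch-then-parse pattern (the URL and XPath here are only placeholders; the full example further below does the real work):

import requests
from lxml import etree

resp = requests.get('https://example.com/')   # fetch the page over HTTP
html = etree.HTML(resp.text)                  # build a DOM tree from the HTML
print(html.xpath('//title/text()'))           # pull nodes out with an XPath expression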
However, none of these libraries can execute JavaScript. For pages that need it, Selenium can be used instead: it drives a real browser and simulates a user requesting and interacting with the page (Selenium started out as an automated-testing tool).
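As an illustration, here is a minimal Selenium sketch, assuming Selenium 4 and a chromedriver available on the local machine (the XPath mirrors the one used in the example below):

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()   # start a browser that the script can drive
driver.get('https://help.aliyun.com/notice_list_page/9213612/1.html')
# XPath also works here, but only after the page's JavaScript has run
for link in driver.find_elements(By.XPATH, "//ul/li/a"):
    print(link.text)
driver.quit()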
Example code (requests + lxml)
# -*- coding: UTF-8 -*-
import sys
reload(sys)                      # Python 2 only: force the default encoding to UTF-8
sys.setdefaultencoding('utf8')
import requests
# lxml can parse both HTML and XML; etree walks the DOM tree via XPath
from lxml import etree


class paramItem(object):
    # title of each security announcement
    name = None
    # date/time of every security announcement
    time = None
    # full URL of each announcement title
    urlhref = None
    # total number of announcement pages
    max_pages = 0
    # description text scraped from each announcement link
    urldetail = None
    origin = 1  # 1 = Aliyun
    now_totalCount = None


# collect the publication date/times of the announcements into totalCount
def getTotalCount():
    totalCount = []
    getContent = requests.get('https://help.aliyun.com/notice_list_page/9213612/1.html')
    html = etree.HTML(getContent.text)
    # "//" selects matching nodes anywhere in the document, regardless of position
    # span[@class='y-right'] selects every span whose class attribute is y-right
    date = html.xpath("//ul/li/span[@class='y-right']/text()")
    time = html.xpath("//ul/li//span[@class='time']/text()")
    for i in range(len(date)):
        totalCount.append(date[i] + ' ' + time[i])  # use the current time as the baseline
    # page = html.xpath("//script[last()]/text()")
    # string1 = str(page[0])                    # convert to string
    # list1 = string1.split(":")                # split out the value
    # totalCount = int(list1[1].split(",")[0])  # total number of entries
    return totalCount


class getThreaten_ali(object):
    def __init__(self):
        self.url = 'https://help.aliyun.com/notice_list_page/9213612/'
        self.items = self.getContent()

    def getContent(self):
        items = []
        number = 1
        while True:
            getContent = requests.get(self.url + str(number) + '.html')
            number += 1
            item = paramItem()  # instantiate one item per result page
            html = etree.HTML(getContent.text)
            # announcement titles
            temp_title_name = html.xpath("//ul/li/a/text()")
            # assign the attribute values
            # getContent = requests.get(self.url)
            if len(temp_title_name) == 0:
                # an empty page means we walked past the last page
                return items
            else:
                item.name = temp_title_name
                # item.max_pages = html.xpath("//div[@class='dn-pagination-list dn-hidden-xs']/div[@class='dn-pagination-item last']")
                item.time = []
                # announcement dates and times
                date = html.xpath("//ul/li/span[@class='y-right']/text()")
                time = html.xpath("//ul/li//span[@class='time']/text()")
                for i in range(len(date)):
                    item.time.append(date[i] + ' ' + time[i])  # date + time
                # links behind the announcement titles
                item.urlhref = html.xpath("//ul/li/a/@href")
                urldetail = []
                # build the full URL of every announcement
                for i in range(len(item.urlhref)):
                    item.urlhref[i] = 'https://help.aliyun.com' + str(item.urlhref[i])
                for j in item.urlhref:
                    # fetch the content behind each announcement link
                    getContent2 = requests.get(j)
                    html2 = etree.HTML(getContent2.text)
                    # description text inside the announcement page
                    detail = html2.xpath("//div[@class='notice-main-center']//text()")
                    resstr = ""
                    for text in detail:
                        newtext = str(text).strip()
                        resstr += newtext + '\n'
                    # collect the description of every link into urldetail
                    urldetail.append(resstr)
                item.now_totalCount = item.time[0]
                item.urldetail = urldetail
                items.append(item)


if __name__ == '__main__':
    c = getThreaten_ali()
    print(len(c.items))
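As a hypothetical follow-up to the script above, the collected fields can be walked like this (name, time and urlhref are parallel lists for each page):

c = getThreaten_ali()
for item in c.items:
    for title, when, link in zip(item.name, item.time, item.urlhref):
        print(title + ' | ' + when + ' | ' + link)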
Example code 2 (urllib)
# -*- coding: UTF-8 -*-
# Python 2 script (note the print statements and the "except MySQLdb.Error, e" syntax below)
import re
import time
import MySQLdb
from urllib import urlopen   # in Python 3 this lives in urllib.request


# convert the time format
def time_formation(time_str):
    # example input: '2017-10-19 02:30:01 (UTC)#'
    # re.compile builds the regular expression into a Pattern object for match() and search()
    # \d is the escape for "match a digit"
    date_pattern = re.compile(r'(\d+)-(\d+)-(\d+)')
    time_pattern = re.compile(r'(\d+):(\d+):(\d+)')
    # re.search scans the whole string and returns the first successful match
    date_result = re.search(date_pattern, time_str)
    time_result = re.search(time_pattern, time_str)
    # group() returns the matched substring
    # time_orignal: 2017-10-19 02:30:01
    time_orignal = date_result.group() + ' ' + time_result.group()
    # parse time_orignal with the format "%Y-%m-%d %H:%M:%S" into a struct_time object, e.g.
    # time.struct_time(tm_year=2017, tm_mon=10, tm_mday=19, tm_hour=2, tm_min=30, tm_sec=1, tm_wday=3, tm_yday=292, tm_isdst=-1)
    timeArray = time.strptime(time_orignal, "%Y-%m-%d %H:%M:%S")
    # mktime takes the struct_time and returns the time as seconds since the epoch
    timeStamp = int(time.mktime(timeArray))
    # ctime converts the seconds back into asctime format: Thu Oct 19 02:30:01 2017
    TIME = time.ctime(timeStamp).encode('utf-8')
    return TIME


def get_ip(string_ip):
    result = re.findall(
        r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b",
        string_ip)
    if result:
        return result
    else:
        return 'None'


def download_ip(source, stamp, url):
    file = open('tmp.txt', 'w')
    # crawl the web page and save it to a local file
    try:
        resp = urlopen(url)
        html_data = resp.read().decode('utf-8')
        file.write(html_data)
    except Exception as e:
        print('ERROR :', e)
        exit()
    file.close()

    # parse the file and load each line into the MySQL database
    update_time = time.ctime().encode("utf-8")
    fo = open('tmp.txt', 'r')
    # connect to the database
    db = MySQLdb.connect(user='username', db='dbname', passwd='password', host='hostname', charset='utf8')
    # a cursor executes queries and fetches results; it reads the result set
    # one row at a time, acting like a pointer into the results
    cursor = db.cursor()
    for eachline in fo:
        # insert data into the 'ip_table' table
        if '#' in eachline:
            if 'updated' in eachline:
                try:
                    update_time = time_formation(eachline)
                except:
                    print('error in line :', eachline)
                    continue
        else:
            ipgroup = get_ip(eachline)
            for ip in ipgroup:
                if len(ip) < 7:
                    continue
                try:
                    sql = "replace into ip_table(ip, update_time, source, stamp) " \
                          "values ('%s', '%s', '%s', '%s')" % (ip, update_time, str(source), stamp)
                    cursor.execute(sql)
                    db.commit()
                except MySQLdb.Error, e:
                    print "Error occurred: %s " % e.args[0]
                    print e
                    db.rollback()
    db.commit()
    db.close()
    fo.close()


source = '10'
url = 'https://raw.githubusercontent.com/firehol/blocklist-ipsets/master/cruzit_web_attacks.ipset'
stamp = 'attack'
download_ip(source, stamp, url)
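A quick, hypothetical check of the two helpers above (the sample lines are made up; run under Python 2, like the script itself):

line = 'updated 2017-10-19 02:30:01 (UTC)#'
print(time_formation(line))                  # Thu Oct 19 02:30:01 2017
print(get_ip('1.2.3.4 scanned 10.0.0.1'))    # ['1.2.3.4', '10.0.0.1']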