python3.3 lxml+beautifulsoup 爬虫说明

最新推荐文章于 2021-01-30 12:04:42 发布

qq_23438131

最新推荐文章于 2021-01-30 12:04:42 发布

阅读量1.5k

点赞数

本文链接：https://blog.csdn.net/qq_23438131/article/details/52219763

版权

1.安装python3.3版本

2.安装pip;

3.安装bs4和lxml工具包

安装bs4：pip install bs4，或运行 bs4.exe 安装包

安装lxml：http://blog.csdn.net/qq_23438131/article/details/52222489

4.控制编码格式：

#coding:utf-8

import sys

# Python 3: str is always Unicode and sys.getdefaultencoding() already
# returns 'utf-8'.  The Python 2 idiom reload(sys) followed by
# sys.setdefaultencoding('utf-8') must not be used here: reload is no
# longer a builtin and sys.setdefaultencoding does not exist, so those
# calls fail at import time.  To control the encoding of files, pass
# encoding='utf-8' to open() instead.

5.引用bs4

import bs4
from bs4 import BeautifulSoup as bs

6.根据关键字百度搜索

#coding:utf-8
import bs4
from bs4 import BeautifulSoup as bs
import urllib.parse
import urllib.request

import functools
import re
import time

from time import sleep

#import socket
#socket.setdefaulttimeout(3)

class BaiduSpider(object):
    def __init__(self,word,max_link):
        self._word = word
        self._max_link = max_link
        p = {"word":word}
        self._start_url = "http://www.news.baidu.com/ns?" + urllib.parse.urlencode(p)

    def _get_links(self, retries=3):
        """Return the result-page URLs to crawl, capped at ``_max_link``.

        The first entry is always the start URL.  The link to the second
        result page is read from the ``#page`` pagination element of the
        first page; the remaining pages are derived from it by rewriting
        its ``pn`` offset to 20, 30, ..., 800.

        retries: how many more times the first page is re-fetched (after a
            10 second pause) when fetching/parsing raises AttributeError.
            Once the budget is used up only the start URL is returned.

        When the page has no pagination element, or the element holds no
        link, only the start URL is returned instead of raising.
        """
        links = [self._start_url]
        try:
            soup = bs(self._get_html(self._start_url),"lxml")
            pager = soup.select("#page")
        except AttributeError as e_Att:
            print(e_Att)
            if retries <= 0:
                return links[:self._max_link]
            time.sleep(10)
            return self._get_links(retries - 1)

        # The second-page link is the first tag child that carries an href.
        # Text nodes between the tags have no attributes and are skipped.
        second_href = None
        if pager:
            for child in pager[0].children:
                if isinstance(child, bs4.element.Tag) and child.get("href"):
                    second_href = child["href"]
                    break
        if second_href is None:
            return links[:self._max_link]

        second_link = "http://www.news.baidu.com" + second_href
        links.append(second_link)

        # Derive the links for offsets 20..800 from the second-page link.
        # NOTE: a link without an "&pn=<digits>&" segment is left unchanged
        # by the substitution, so every derived link would equal it.
        offset_pattern = re.compile(r'&pn=(\d)+?&')
        for offset in range(20,810,10):
            links.append(offset_pattern.sub('&pn='+str(offset)+'&', second_link))

        return links[:self._max_link]
    
    def _rightTime(self,summary):
        '''
        判断summary中的时间是否在2016年6月1日至今
        中国基金网  14小时前
        网易新闻  2016年08月12日 16:35
        '''
        #2016-06-01转化为datetime
        try:
            startDate_str = '2016-06-01'
            startTime =  time.mktime(time.strptime(startDate_str, '%Y-%m-%d'))
            a = summary.split()
            time_in_text = a[1]
            if '年' in time_in_text:
                time_in_text = time_in_text.split(" ")[0]
                time_in_text = time_in_text.replace("年",'-').replace("月",'-').replace("日",'')
                textTime = time.mktime(time.strptime(time_in_text, '%Y-%m-%d'))
                if (float(textTime))<=(float(startTime)):
                    return False
            return True
        except ValueError:
            print (time_in_text)
    
    
    def _get_html(self,link):
        res = urllib.request.urlopen(link)
        return res.read().decode("utf-8")
    
    def _get_html_Content_post(self,link,f_error,retries):
        print (link,'open the link using the post method:',time.time())
        html_content = ''
        try:
            request = urllib.request.Request(link)
            res =urllib.request.urlopen(request,timeout=3)
            html_content = res.read()
        except Exception as e:       #爬虫卡住或其他异常，则再次尝试，尝试机会有3次
            print(link+'\n')
            print(e)
            f_error.write(link+'\n')
            if retries:
               return self._get_html_Content_post(link, f_error,retries-1)
        print ('close:',time.time())
        return html_content
    
        
    def _get_html_Content(self,link, f_error,retries=2):
        print (link,'\n','open the link:',time.time())
        html_content = ''
        try:
            user_agent='Mozilla/4.0(compatible;MSIE 5.5;Windows NT)'
            headers={'User-Agent':user_agent}
            request = urllib.request.Request(link)
            request.add_header('User-Agent', user_agent)
            #timeout=2
            res =urllib.request.urlopen(request,timeout=3)
            html_content = res.read()
        except Exception as e:       #爬虫卡住或其他异常，则再次尝试，尝试用post方式打开
            print(link+'\n')
            print(e)
            f_error.write(link+'\n')
            if retries:
                return self._get_html_Content_post(link, f_error,retries=3)
        print ('close:',time.time())
        return html_content

    def _get_content(self,content):
        # 先要把bs4.element.NavigableString类型转化为string类型
        return functools.reduce(lambda x,y:x+y,map(lambda x:x.replace("<em>","").replace("</em>",""),
                                     map(lambda x:x.string,content)))
    def _spiderDetail(self, link,f_error,Verbdic):
        '''
        input:link,f_error
        output:contents contained xiepeiyiverb
        通过第一步获取的URL，得到新闻所在的内容页面URL，由于百度新闻列表页面上的新闻来自不同的站，
        所以很难找到一个通用的结构，大多数的新闻类网站，内容都是放在p标签内，所以就采用了如下的方式获取新闻的内容
        '''
        html_content = self._get_html_Content(link, f_error,retries=2)
        contents =''
        if html_content != '':
            soup = bs(html_content,"lxml")
            #reg=u".+?带领"
            #Res = re.compile(reg)
            #contents = soup.findAll(name="p", text=Res)
            contents = '<p>'
            iter = []
            nodes_p = soup.find_all(name='p')
            for n in nodes_p:
                p_cont = n.get_text(strip=True)
                for ver in Verbdic:
                    if ver in p_cont:
                        iter.append(p_cont)
                        break
            contents = contents.join(iter)
        return contents
        
    
    def _spider(self,f, f_error,Verbdic):
        """Crawl every result page and write one line per news item to f.

        For each item the title, the "source and time" summary, the link
        and, when _rightTime accepts the summary, the keyword paragraphs of
        the linked page are written as one tab-separated line.

        f: writable text file receiving the results.
        f_error: writable text file passed on to _spiderDetail.
        Verbdic: keywords passed on to _spiderDetail.

        Pages without a result container and items lacking a title link or
        a summary paragraph are skipped instead of aborting the crawl.
        """
        total_links = self._get_links()
        print (total_links)
        for i,l in enumerate(total_links):
            print ("Page {0}".format(i+1))
            soup = bs(self._get_html(l),"lxml")
            # Root node of the left-hand content column.
            left_divs = soup.select("#content_left")
            if not left_divs:
                continue
            # Reset per page so a page without results can never reuse the
            # container found on the previous page.
            base_div = None
            for child_div in left_divs[0].children:
                if (isinstance(child_div,bs4.element.Tag) and child_div.div
                        and child_div.div.get('class')
                        and 'result' in child_div.div['class']):
                    base_div = child_div
            if base_div is None:
                continue

            # base_div holds the list of news items.
            for child in base_div.children:
                # Text nodes between the items cannot be queried.
                if not isinstance(child,bs4.element.Tag):
                    continue
                titles = child.select(".c-title")
                summaries = child.select(".c-summary")
                if not titles or not summaries:
                    continue
                anchor = titles[0].a
                summary_p = summaries[0].p
                if anchor is None or summary_p is None or not anchor.get("href"):
                    continue
                a_link = anchor["href"]
                summary = self._get_content(summary_p.contents) if summary_p.contents else ""
                titlename = self._get_content(anchor.contents) if anchor.contents else ""
                # Fetch the article page only for sufficiently recent items.
                content = ''
                if self._rightTime(summary):
                    content = self._spiderDetail(a_link, f_error,Verbdic)
                f.write ('标题:'+titlename+'\t来源及时间:'+summary+
                         '\t链接:'+a_link
                         +'\t新闻内容:'+content+"\n")
                   
    def start(self,f, f_error,Verbdic):
        """Run the crawl; the three arguments are passed straight to _spider."""
        self._spider(f=f, f_error=f_error, Verbdic=Verbdic)

if '__main__' == __name__:
    # f receives the crawl results.
    # f_error receives the links whose article page could not be read.
    # f_over receives each keyword once its crawl has finished.
    Verbdic = [
               '协同','协助'
               ]
    # Chinese keywords are written to f_over and arbitrary URLs to f_error,
    # so all three files are opened as UTF-8 rather than with the platform
    # default encoding, which may be unable to encode them.
    with open("links2.txt",'wt',encoding='utf-8') as f, \
            open("logError2.txt",'wt',encoding='utf-8') as f_error, \
            open("overVerb.txt",'wt',encoding='utf-8') as f_over:
        for keyword in Verbdic:
            baidu_spider = BaiduSpider(keyword,800)
            baidu_spider.start( f, f_error,Verbdic)
            f_over.write(keyword+'\n')

7.爬虫问题：

1.Python程序卡住：原因是链接的网站反爬虫、get/post方式错误、网络问题等。

解决方法一：模拟浏览器上网：

            user_agent='Mozilla/4.0(compatible;MSIE 5.5;Windows NT)'
            headers={'User-Agent':user_agent}
            request = urllib.request.Request(link)
            request.add_header('User-Agent', user_agent)

解决方法二：超时重试：

        try:
            request = urllib.request.Request(link)
            res =urllib.request.urlopen(request,timeout=3)
            html_content = res.read()
        except Exception as e:       #爬虫卡住或其他异常，则再次尝试，尝试用post方式打开
            print(link+'\n')
            print(e)
            f_error.write(link+'\n')
            if retries:
                return self._get_html_Content_post(link, f_error,retries=3)

解决方法三：如果模拟浏览器方式无法打开网页，即无法用get方式打开网页，则采用post方式打开网页（注意：下面的代码没有给 Request 传入 data 参数，urllib 实际发送的仍是 GET 请求；要真正发送 POST，需要传入 data）：

    def _get_html_Content_post(self,link,f_error,retries):
        print (link,'open the link using the post method:',time.time())
        html_content = ''
        try:
            request = <span style="font-family: Arial, Helvetica, sans-serif;">urllib.request.</span><span style="font-family: Arial, Helvetica, sans-serif;">.Request(link)</span>
            res = <span style="font-family: Arial, Helvetica, sans-serif;">urllib.request</span><span style="font-family: Arial, Helvetica, sans-serif;">.urlopen(request,timeout=3)</span>
            html_content = res.read()
        except Exception as e:       #爬虫卡住或其他异常，则再次尝试，尝试机会有3次
            print(link+'\n')
            print(e)
            f_error.write(link+'\n')
            if retries:
               return self._get_html_Content_post(link, f_error,retries-1)
        print ('close:',time.time())
        return html_content