Python爬取天涯帖子

# -*- coding:utf-8 -*-  
from lxml import html
from Tools import ToolsMethods
import requests

def parseHtmlTrees(tree):
    """Print the post text found under elements authored by one Tianya user.

    The XPath selects every element whose ``_host`` attribute equals the
    hard-coded user name and prints each text node found at
    ``div[2]/div[2]/div[1]`` beneath it, one per line.

    Args:
        tree: Parsed document exposing an ``xpath`` method (e.g. an lxml
            element).  Anything without ``xpath`` -- such as the integer
            ``404`` that ``ToolsMethods.getHtmlTrees`` returns for a missing
            page -- is ignored.

    Returns:
        None.  The matched text is written to standard output.
    """
    # getHtmlTrees signals a missing page by returning the int 404 rather
    # than a tree; calling .xpath on it would raise AttributeError.
    if not hasattr(tree, 'xpath'):
        return

    # Sample absolute path of a single post body: //*[@id="10"]/div[2]/div[2]/div[1]
    # NOTE(review): '//div[@class="bbs-content"]/text()' was tried as an
    # alternative selector; presumably it matches posts from every author.
    contents = tree.xpath(u'//*[@_host="李苏爱叨叨"]/div[2]/div[2]/div[1]/text()')
    for e in contents:
        # Parenthesised form is valid on both Python 2 and Python 3.
        print(e)


if __name__ == '__main__':
    # Script entry point: download page 1 of the thread and print the
    # selected posts.
    start_url = 'http://bbs.tianya.cn/post-1095-24930-1.shtml'
    page_tree = ToolsMethods().getHtmlTrees(start_url)
    parseHtmlTrees(page_tree)
复制代码

Tools.py

# -*- coding: UTF-8 -*-
import random
import re
import requests
import pymysql
import hashlib
import os
from lxml import html


# MD5 hex digests of every string already passed to
# ToolsMethods.stringIntoMD5.  Kept as a module-level list so existing code
# that reads or appends to it keeps working.
HaveLoadList = []


class ToolsMethods(object):
    """Helper routines for the crawler: headers, page fetch, dedup, download."""

    # Seconds to wait for the server before requests raises a Timeout.
    # Without a timeout a stalled connection blocks the crawler forever.
    REQUEST_TIMEOUT = 30

    # SECURITY NOTE(review): credentials are hard-coded in source; move them
    # to configuration or environment variables before sharing this file.
    DB_CONFIG = {
        'host': 'localhost',
        'user': 'root',
        'passwd': '44124985',
        'db': 'acgcloumn',
        'port': 3306,
        'charset': 'utf8',
    }

    # Browser User-Agent values to rotate through.  Each entry is the bare
    # header *value*: the pasted "User-Agent:" prefixes, a stray tab, an
    # embedded line break, an unbalanced parenthesis and a trailing
    # "Request Payload" fragment from the original list have been removed,
    # and exact duplicates dropped.
    USER_AGENTS = (
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:56.0) Gecko/20100101 Firefox/56.0',
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Mobile Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5',
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:56.0)',
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',
        'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
        'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
        'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
    )

    def getHeads(self):
        """Return request headers for bbs.tianya.cn with a random User-Agent.

        Returns:
            dict with the keys ``Host``, ``User-Agent``, ``Accept`` and
            ``Connection``.  ``User-Agent`` is drawn uniformly from
            ``USER_AGENTS`` on every call.
        """
        return {
            'Host': 'bbs.tianya.cn',
            'User-Agent': random.choice(self.USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Connection': 'keep-alive',
        }

    def getTitleUrl(self):
        """Read ``weburl`` and ``id`` of every ``cloumn_tab`` row with id > 8.

        Returns:
            tuple ``(urlList, idlist)`` of two lists in row order: the
            ``weburl`` values and the matching ``id`` values.
        """
        connect = pymysql.connect(**self.DB_CONFIG)
        try:
            cursor = connect.cursor()
            # The connection already selects the database, and a plain
            # SELECT needs no commit.
            cursor.execute('select weburl,id from cloumn_tab where id > 8')
            result = cursor.fetchall()
        finally:
            # Release the connection even when the query raises.
            connect.close()
        urlList = [row[0] for row in result]
        idlist = [row[1] for row in result]
        return (urlList, idlist)

    def getHtmlTrees(self, pageUrl):
        """Download ``pageUrl`` and parse the response body with lxml.

        Args:
            pageUrl: Address of the page to fetch.

        Returns:
            The element produced by ``html.fromstring`` for the response
            text, or the integer ``404`` (after printing it) when the server
            answers with status 404.  Other status codes are not inspected;
            their bodies are parsed as-is.

        Raises:
            requests.exceptions.RequestException: on connection failure or
                when the server does not respond within ``REQUEST_TIMEOUT``.
        """
        response = requests.get(
            pageUrl, headers=self.getHeads(), timeout=self.REQUEST_TIMEOUT)
        if response.status_code == 404:
            # Parenthesised form is valid on both Python 2 and Python 3.
            print(404)
            return 404
        return html.fromstring(response.text)

    def stringIntoMD5(self, str):
        """Record the MD5 of ``str`` and report whether it is new.

        Args:
            str: Text or bytes to fingerprint.  Text is encoded as UTF-8
                first, because ``hashlib`` only hashes bytes.

        Returns:
            True if the digest was not yet in ``HaveLoadList`` (it is
            appended), False if it was already there.
        """
        payload = str
        if not isinstance(payload, bytes):
            payload = payload.encode('utf-8')
        md5Str = hashlib.md5(payload).hexdigest()
        if md5Str in HaveLoadList:
            return False
        HaveLoadList.append(md5Str)
        return True

    def downLoadPicture(self, picUrl, picName, path):
        """Download ``picUrl`` to ``f:\\acgTitleImg\\<path>\\<picName>``.

        The file is written only when the server answers with status 200;
        any other status leaves the disk untouched.  The target directory
        must already exist.

        Args:
            picUrl: Address of the picture.
            picName: File name to store the picture under.
            path: Sub-directory of ``f:\\acgTitleImg`` to write into.

        Raises:
            requests.exceptions.RequestException: on connection failure or
                timeout.
        """
        path_pic = 'f:\\acgTitleImg\\' + path + '\\' + picName
        # Only the User-Agent is reused: the Host header from getHeads is
        # specific to bbs.tianya.cn and would be wrong for other image hosts.
        headers = {'User-Agent': self.getHeads()['User-Agent']}
        picResponse = requests.get(
            picUrl, headers=headers, timeout=self.REQUEST_TIMEOUT)
        if picResponse.status_code == 200:
            # Context manager closes the handle even if the write fails.
            with open(path_pic, 'wb') as pic_file:
                pic_file.write(picResponse.content)
复制代码
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值