Web scraping notes


Code adapted from:
https://blog.csdn.net/luotuo818/article/details/78745841
https://blog.csdn.net/baidu_35085676/article/details/68958267
https://blog.csdn.net/u013243986/article/details/54928638

1 Scenic spot information (piao.qunar.com)

# -*- coding: utf-8 -*-
import requests  
from bs4 import BeautifulSoup  
import csv  
import random  
from time import sleep  
User_Agent=["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36","Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50","Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50","Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1","Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"]  
  
HEADERS = {  
    'User-Agent':  User_Agent[random.randint(0,4)],  
    # 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/201002201 Firefox/55.0',  
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',  
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',  
    'Accept-Encoding': 'gzip, deflate, br',  
    'Cookie': '',  
    'Connection': 'keep-alive',  
    'Pragma': 'no-cache',  
    'Cache-Control': 'no-cache'  
}
  
csvfile = open('F://5.csv','w',encoding='utf-8', newline='')  
writer = csv.writer(csvfile)  
writer.writerow(["区域","名称","景点id","类型","级别","热度","地址","特色","经纬度"])  
  
def download_page(url):  # download a page once, without retry (not used below)
    try:
        data = requests.get(url, headers=HEADERS, allow_redirects=True).content  # request the page and return the raw content
        return data
    except requests.RequestException:
        return None
  
# Download a page and parse it with BeautifulSoup; if the request is blocked (non-200), wait 5 seconds and retry
def download_soup_waitting(url):
    try:  
        response= requests.get(url,headers=HEADERS,allow_redirects=False,timeout=5)  
        if response.status_code==200:  
            html=response.content  
            html=html.decode("utf-8")  
            soup = BeautifulSoup(html, "html.parser")  
            return soup  
        else:  
            sleep(5)  
            print("等待ing")  
            return download_soup_waitting(url)  
    except:  
        return ""  
  
def getTypes():
    url="http://piao.qunar.com/ticket/list.htm?keyword=%E7%83%AD%E9%97%A8%E6%99%AF%E7%82%B9&region=&from=mpl_search_suggest&city=%E8%A5%BF%E5%AE%89&subject=&page=1"
    i=0
    getType(url,i)
  
def getType(url,i):
    soup=download_soup_waitting(url)
    if not soup:  # download failed, give up on this page
        return
    search_list=soup.find('div', attrs={'id': 'search-list'})
    sight_items=search_list.findAll('div', attrs={'class': 'sight_item'})
            
    for sight_item in sight_items:  
        name=sight_item['data-sight-name']  
        districts=sight_item['data-districts']  
        point=sight_item['data-point']  
        address=sight_item['data-address']  
        data_id=sight_item['data-id']  
        level=sight_item.find('span',attrs={'class':'level'})  
        if level:  
            level=level.text  
        else:  
            level=""  
        product_star_level=sight_item.find('span',attrs={'class':'product_star_level'})  
        if product_star_level:  
            product_star_level=product_star_level.text  
        else:  
            product_star_level=""  
        intro=sight_item.find('div',attrs={'class':'intro'})  
        if intro:  
            intro=intro['title']  
        else:  
            intro=""  
        writer.writerow([districts.replace("\n",""),name.replace("\n",""),data_id.replace("\n",""),level.replace("\n",""),product_star_level.replace("\n",""),address.replace("\n",""),intro.replace("\n",""),point.replace("\n","")])  
    next=soup.find('a',attrs={'class':'next'})  
    if next:  
        next_url="http://piao.qunar.com"+next['href']
        print (i)
        sleep(8)
        getType(next_url,i+1)  
  
if __name__ == '__main__':  
    getTypes()  
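
The recursive retry in download_soup_waitting calls itself again every time the site answers with a non-200 status, so a long block can recurse without bound. A minimal sketch of a bounded, iterative retry using the same HEADERS (the download_soup_bounded name and the max_retries parameter are illustrative, not part of the original script):

def download_soup_bounded(url, max_retries=5):
    # Try the page at most max_retries times, sleeping 5 seconds between attempts.
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=HEADERS, allow_redirects=False, timeout=5)
            if response.status_code == 200:
                return BeautifulSoup(response.content.decode("utf-8"), "html.parser")
        except requests.RequestException:
            pass  # network error: fall through and retry
        print("waiting to retry... attempt %d" % (attempt + 1))
        sleep(5)
    return None  # every attempt failed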


2 Apple App Store reviews

# -*- coding: utf-8 -*-
import random
import requests
import re
from time import sleep
from bs4 import BeautifulSoup

User_Agent=["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36","Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50","Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50","Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1","Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"]
HEADERS = {
    'User-Agent':  User_Agent[random.randint(0,4)],  
    # 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/201002201 Firefox/55.0',  
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',  
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',  
    'Accept-Encoding': 'gzip, deflate, br',  
    'Cookie': '',  
    'Connection': 'keep-alive',  
    'Pragma': 'no-cache',  
    'Cache-Control': 'no-cache'  
}

def getHTMLText(url):
    try:
        response= requests.get(url,headers=HEADERS,allow_redirects=False,timeout=5)  
        if response.status_code==200:
            html=response.content
            html=html.decode("utf-8")
            soup = BeautifulSoup(html, "html.parser")
            return soup  
        else:  
            sleep(5)  
            print("等待ing")  
            return download_soup_waitting(url)
    except:
        return ''

def printAPPName(html):
    try:
        pattern = re.compile(r'{"im:name":{"label":(.*?)}, "rights"', re.S)
        # Without re.S, '.' does not match '\n', so the pattern can only match within a single line.
        # With re.S, the whole string is treated as one block and '.' also matches '\n', so matches may span lines.
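        # For example (illustrative values, not from the original post):
        #   re.findall(r'a(.*?)b', 'a1\nb')        -> []        without re.S ('.' stops at the newline)
        #   re.findall(r'a(.*?)b', 'a1\nb', re.S)  -> ['1\n']   with re.S ('.' also matches '\n')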
        APPName = re.findall(pattern, str(html))
        return 'APPName:' + str(APPName)
    except:
        return ''

def fillUnivlist(titles, comments, stars, html):
    try:
        pattern = re.compile(r'"title":{"label":(.*?)}, "content"', re.S) #提取标题
        nbaInfo = re.findall(pattern, str(html)) #提取title

        # findStr = '"title":{"label":'
        # nbaInfo = nbaInfo1[nbaInfo1.find(findStr)+len(findStr):]
        patternFloor = re.compile(r'"content":{"label":(.*?), "attributes":{"type":"text"}}', re.S)  # extract review content
        floorText = re.findall(patternFloor, str(html))

        patternStar = re.compile(r'"im:rating":{"label":(.*?)}, "id"', re.S)  # extract star ratings
        star = re.findall(patternStar, str(html))
        # print(str(star))

        number = len(nbaInfo)
        print(number)
        for i in range(number):
            Info = nbaInfo[i]
            if i==0:Info = Info[Info.find('"title":{"label":')+len('"title":{"label":'):]  # the first match can drag in feed-level JSON; keep only the text after the entry's own title label
            # print(Info)
            Info1 = floorText[i]
            Info2 = star[i]
            # print(Info2+"hello")
            titles.append('title:' + Info)
            comments.append('content:' + Info1)
            stars.append('star:' + Info2)
    except:
        return ''

def writeText(titleText, fpath):
    try:
        with open(fpath, 'a', encoding='utf-8') as f:
            f.write(str(titleText)+'\n')
            f.write('\n')
    except:
        return ''

def writeUnivlist(titles, comments, stars, fpath, num):
    with open(fpath, 'a', encoding='utf-8') as f:
        for i in range(num):
            f.write(str(stars[i]) + '\n')
            f.write('*' * 10 + '\n')
            f.write(str(titles[i]) + '\n')
            f.write('*' * 50 + '\n')  # separator line of '*' characters
            f.write(str(comments[i]) + '\n')
            f.write('*' * 100 + '\n')

def main():
    count = 0
    url = 'https://itunes.apple.com/rss/customerreviews/page=1/id=481623196/sortby=mostrecent/json?l=en&&cc=cn'  # review feed URL (page 1)
    output_file = 'D:/info.txt'  # output text file
    html = getHTMLText(url)  # fetch page 1 of the feed
    APPName = printAPPName(html)
    writeText(APPName, output_file)
    for i in range(1, 4):  # pages 1 to 3 of the review feed
        titles = []
        comments = []
        stars = []
        url = 'https://itunes.apple.com/rss/customerreviews/page=' + str(i) + '/id=481623196/sortby=mostrecent/json?l=en&&cc=cn'
        html = getHTMLText(url)
        #print(html)
        
        fillUnivlist(titles, comments, stars, html)
        writeUnivlist(titles, comments, stars, output_file, len(titles))
        
        count = count + 1
        print("\r当前进度: {:.2f}%".format(count * 100 / 10), end="")

if __name__ == '__main__':
    main()
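
The regular expressions above pull the title, content, and rating fields out of the feed's JSON as if it were plain text, which breaks easily if the layout shifts. Since the endpoint already returns JSON, here is a sketch that parses it with the json module instead; the entry keys mirror the labels the regexes above target, but treat the exact feed layout (and the fetch_reviews name) as assumptions rather than a confirmed API:

import json

def fetch_reviews(page):
    # Fetch one page of the review feed and return (title, rating, content) tuples.
    url = ('https://itunes.apple.com/rss/customerreviews/page=' + str(page) +
           '/id=481623196/sortby=mostrecent/json?l=en&&cc=cn')
    data = json.loads(requests.get(url, headers=HEADERS, timeout=5).text)
    reviews = []
    for entry in data.get('feed', {}).get('entry', []):
        if 'im:rating' not in entry:
            continue  # skip entries without a rating, e.g. the app-info entry
        reviews.append((entry['title']['label'],
                        entry['im:rating']['label'],
                        entry['content']['label']))
    return reviews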

3 Scraping photos (mzitu.com)

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import random
my_headers = [
    'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30',
    'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)',
    'Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)']

header={"User-Agent":random.choice(my_headers)}
Picreferer = {
    'User-Agent':'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer':'http://i.meizitu.net'
}

r=requests.get("http://www.mzitu.com/131411", headers=header)
soup=BeautifulSoup(r.text, 'lxml')
image=soup.find_all('p')[0].find_all('img')[0]['src']
print(image)

img_response = requests.get(image, headers=Picreferer)  # the Referer header keeps the image host from rejecting the request
with open("d://a.jpg", 'wb') as f:
    f.write(img_response.content)
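
For larger files, a streamed download avoids holding the whole image in memory. A minimal sketch using the standard stream=True / iter_content pattern from requests (the save_image name, chunk size, and output path are illustrative):

def save_image(url, path):
    # Stream the image to disk in chunks instead of reading it all at once.
    with requests.get(url, headers=Picreferer, stream=True, timeout=10) as resp:
        resp.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)

save_image(image, "d://b.jpg")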