crawler(2)


Setting a proxy

from urllib import request


def use_proxy(proxy_adrr,url):
    """
    this function uses a proxy to open a webpage

    args:
        proxy_adrr: string, proxy address
        url: string, url to open 

    return:
        data: the data of the webpage
    """
    # create a ProxyHandler object for the given proxy address
    proxy=request.ProxyHandler({'http':proxy_adrr})
    # build an opener that contains the ProxyHandler and an HTTPHandler
    own_opener=request.build_opener(proxy,request.HTTPHandler)
    # install the opener globally
    request.install_opener(own_opener)
    data=request.urlopen(url).read().decode('utf-8')
    return data


proxy_adrr="183.47.40.35:8088"  # 找验证时间较短的代理ip,成功率比较高
url='http://www.baidu.com'
data=use_proxy(proxy_adrr,url)
print(len(data))
277
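
install_opener() changes the behaviour of every later urlopen() call. If the proxy should only apply to a single request, the opener can also be used directly without installing it globally. A minimal sketch under that assumption (the helper name use_proxy_local is made up):

from urllib import request


def use_proxy_local(proxy_addr, url):
    """Open url through an HTTP proxy without touching the global opener."""
    proxy = request.ProxyHandler({'http': proxy_addr})
    opener = request.build_opener(proxy)
    # opener.open() uses the proxy only for this call; urlopen() elsewhere
    # is unaffected because install_opener() is never called
    return opener.open(url, timeout=10).read().decode('utf-8')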

Exception handling

from urllib import request
from urllib import error


try:
    data=request.urlopen('http://blog.csdn.net').read()
    hf=open(r'D:\pythoncode\crawler\test.html','wb')
    hf.write(data)
    hf.close()
except error.URLError as e:
    # print(e.code) -- a plain URLError has no code attribute
    print(e.reason)
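
error.HTTPError is a subclass of error.URLError and does carry a status code, so catching it before the plain URLError gives access to both pieces of information. A small sketch of that pattern:

from urllib import request, error

try:
    data = request.urlopen('http://blog.csdn.net', timeout=10).read()
except error.HTTPError as e:
    # an HTTPError (e.g. 403/404/500) has both a status code and a reason
    print(e.code, e.reason)
except error.URLError as e:
    # a plain URLError (DNS failure, refused connection, ...) only has a reason
    print(e.reason)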

Cookies

from urllib import request, parse
from http import cookiejar


url='http://bbs.chinaunix.net/member.php?mod=logging&action=login&action=login&loginsubmit=yes&loginhash=L768q'
# encode the post data
postdata=parse.urlencode({
    'username':'weisuen',
    'password':'aA123456'
}).encode('utf-8')

# request object
req=request.Request(url,postdata)
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36')
# create a CookieJar object
cjar=cookiejar.CookieJar()
# create a cookie handler with HTTPCookieProcessor and build an opener from it
opener=request.build_opener(request.HTTPCookieProcessor(cjar))
# install the opener globally
request.install_opener(opener)
file=opener.open(req)
data=file.read()
hf=open(r'D:\pythoncode\crawler\cookie1.html','wb')
hf.write(data)
hf.close()
url2='http://bbs.chinaunix.net'
data2=request.urlopen(url2).read()
hf2=open(r'D:\pythoncode\crawler\cookie2.html','wb')
hf2.write(data2)
hf2.close()
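
CookieJar only keeps the cookies in memory. If the login session should survive between runs, http.cookiejar.MozillaCookieJar can save them to a text file and load them back later. A sketch, where the filename cookies.txt is an arbitrary choice:

from http import cookiejar
from urllib import request

# save the cookies received during this session to disk
cjar = cookiejar.MozillaCookieJar('cookies.txt')
opener = request.build_opener(request.HTTPCookieProcessor(cjar))
opener.open('http://bbs.chinaunix.net')
cjar.save(ignore_discard=True, ignore_expires=True)

# in a later run, reload the saved cookies and build a new opener from them
cjar2 = cookiejar.MozillaCookieJar()
cjar2.load('cookies.txt', ignore_discard=True, ignore_expires=True)
opener2 = request.build_opener(request.HTTPCookieProcessor(cjar2))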

Image crawler in practice

Crawl all the product images in JD.com's mobile-phone category.

Page 1: https://list.jd.com/list.html?cat=9987,653,655
Page 2: https://list.jd.com/list.html?cat=9987,653,655&page=2&sort=sort_rank_asc&trans=1&JL=6_0_0#J_main
Page 3: https://list.jd.com/list.html?cat=9987,653,655&page=3&sort=sort_rank_asc&trans=1&JL=6_0_0#J_main

These URLs show that the page is selected through the URL itself, i.e. with a GET request; the key field is page, and its value is the page number.
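
For example, the URLs of the first few pages can be generated simply by appending the page number (a tiny sketch of the idea):

base_url = 'https://list.jd.com/list.html?cat=9987,653,655&page='
urls = [base_url + str(page) for page in range(1, 4)]  # pages 1 to 3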

import re
from urllib import request,error


def crawl(url,page):
    """
    this function crawls the images from the given url and page number

    args:
        page: int, the number of the webpage you want to crawl
        url: the start url of the webpage
    """
    # crawl the source code of the webpage
    html_source=request.urlopen(url).read()
    # convert the bytes to a string
    html_sourcestr=str(html_source)
    # select the block of source code that contains the product list
    pattern1='<div id="plist".+? <div class="page clearfix">'
    result1=re.compile(pattern1).findall(html_sourcestr)
    paragraph=result1[0]
    fh=open(r'D:\pythoncode\crawler\jingdongpic\image\1.html','w')
    fh.write(paragraph)
    fh.close()
    # extract the image links from that block; the () in the regex captures the matched part
    pattern2='<img width="220" height="220" data-img="1" src="//(.+?\.jpg)">'
    imagelist=re.compile(pattern2).findall(paragraph)
    fh1=open(r'D:\pythoncode\crawler\jingdongpic\image\2.html','w')
    for image in imagelist:
        fh1.write(image+'\n')
    fh1.close()
    x=1
    for image in imagelist:
        image_file='D:/pythoncode/crawler/jingdongpic/image/'+str(page)+str(x)+'.jpg'
        image_url='http://'+image
        try:
            request.urlretrieve(image_url,filename=image_file)
        except error.URLError as e :
            if hasattr(e,'code'):
                print(e.code)
                x+=1
            elif hasattr(e,'reason'):
                print(e.reason)
                x+=1
        x+=1

        
url='https://list.jd.com/list.html?cat=9987,653,655&page='+str(1)
crawl(url,1)

Running this program downloads only 10 images. Inspecting the page source shows that, besides image links of the form <img width="220" height="220" data-img="1" src="//img14.360buyimg.com/n7/jfs/t13441/73/1250191369/239632/8b94bbc6/5a1d1e2dN6ba9aac4.jpg">, there are also links of the form <img width="220" height="220" data-img="1" data-lazy-img="//img10.360buyimg.com/n7/jfs/t1/3405/18/3537/69901/5b997c0aE5dc8ed9f/a2c208410ae84d1f.jpg">, as well as images in png format. The regular expression above therefore does not cover all of the links; the program below mainly improves the link extraction.
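
Another option, instead of capturing the whole attribute and cleaning it up afterwards as the version below does, is to cover both attribute names and both file extensions in a single regular expression. A sketch of such a pattern, assuming the attribute order in the page source stays the same:

import re

pattern = (r'<img width="220" height="220" data-img="1" '
           r'(?:src|data-lazy-img)="//(.+?\.(?:jpg|png))"')
sample = ('<img width="220" height="220" data-img="1" '
          'data-lazy-img="//img10.360buyimg.com/n7/jfs/t1/3405/18/3537/69901/'
          '5b997c0aE5dc8ed9f/a2c208410ae84d1f.jpg">')
print(re.compile(pattern).findall(sample))
# prints ['img10.360buyimg.com/n7/jfs/t1/3405/18/3537/69901/5b997c0aE5dc8ed9f/a2c208410ae84d1f.jpg']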

import re
from urllib import request,error


def crawl(url,page):
    """
    this function crawls the images from the given url and page number

    args:
        page: int, the number of the webpage you want to crawl
        url: the start url of the webpage
    """
    # crawl the source code of the webpage
    html_source=request.urlopen(url).read()
    # convert the bytes to a string
    html_sourcestr=str(html_source)
    # select the block of source code that contains the product list
    pattern1='<div id="plist".+? <div class="page clearfix">'
    result1=re.compile(pattern1).findall(html_sourcestr)
    paragraph=result1[0]
    fh=open(r'D:\pythoncode\crawler\jingdongpic\image\1.html','w')
    fh.write(paragraph)
    fh.close()
    # extract the image links from that block; the () in the regex captures the matched part
    pattern2='<img width="220" height="220" data-img="1" (.+?)>'
    imagelist=re.compile(pattern2).findall(paragraph)
    return imagelist


url='https://list.jd.com/list.html?cat=9987,653,655&page='+str(1)
a=crawl(url,1)
a   
['src="//img14.360buyimg.com/n7/jfs/t1/204/12/2599/82133/5b963c1aEd9fa390e/970adadd473ff485.jpg"',
 'src="//img13.360buyimg.com/n7/jfs/t19912/33/927199229/297549/8c269ff0/5b0fca0fN8d5600df.jpg"',
 'src="//img10.360buyimg.com/n7/jfs/t28618/324/444877994/238412/bda83f0c/5bf3c4feN776b598c.jpg"',
 'src="//img14.360buyimg.com/n7/jfs/t13441/73/1250191369/239632/8b94bbc6/5a1d1e2dN6ba9aac4.jpg"',
 'src="//img12.360buyimg.com/n7/jfs/t21415/332/642302956/189613/778f2021/5b13cd6cN8e12d4aa.jpg"',
 'src="//img13.360buyimg.com/n7/jfs/t26038/101/1750775983/176935/5976cd0b/5bbc6e6fN5216f959.jpg"',
 'src="//img10.360buyimg.com/n7/jfs/t18157/222/1822300674/231514/6c179af8/5ad87390N086a3c91.jpg"',
 'src="//img11.360buyimg.com/n7/jfs/t22330/332/515182850/188708/3dbe80f8/5b0fbaabN3229c7a3.jpg"',
 'src="//img11.360buyimg.com/n7/jfs/t25954/134/1930444050/488286/31587d0d/5bbf1fc9N3ced3749.jpg"',
 'src="//img14.360buyimg.com/n7/jfs/t18052/318/2334327001/256076/23da5f45/5af13917Naca6cb3d.jpg"',
 'data-lazy-img="//img10.360buyimg.com/n7/jfs/t1/3405/18/3537/69901/5b997c0aE5dc8ed9f/a2c208410ae84d1f.jpg"',
 'data-lazy-img="//img13.360buyimg.com/n7/jfs/t10675/253/1344769770/66891/92d54ca4/59df2e7fN86c99a27.jpg"',
 'data-lazy-img="//img11.360buyimg.com/n7/jfs/t20638/302/805235103/272149/fdafea5c/5b17a2ceN24d043fc.jpg"',
 'data-lazy-img="//img13.360buyimg.com/n7/jfs/t1/3/15/4536/138660/5b997bf8Ed72ebce7/819dcf182d743897.jpg"',
 'data-lazy-img="//img10.360buyimg.com/n7/jfs/t20305/259/1209609364/193755/a3940552/5b21ce25N131ce626.jpg"',
 'data-lazy-img="//img11.360buyimg.com/n7/jfs/t18010/178/1266475805/124462/64f61c52/5ac1f20cN7196beba.jpg"',
 'data-lazy-img="//img11.360buyimg.com/n7/jfs/t10387/284/1763459074/210785/bf836347/59e5bd29N7bdd3d97.jpg"',
 'data-lazy-img="//img11.360buyimg.com/n7/jfs/t24202/21/1240414017/266807/216c76b7/5b5705a1N6a12c28c.jpg"',
 'data-lazy-img="//img14.360buyimg.com/n7/jfs/t23653/202/1047354722/252149/77642e5a/5b4ee1a1Nb44ade36.jpg"',
 'data-lazy-img="//img13.360buyimg.com/n7/jfs/t22099/97/2250481418/237159/c4a1502d/5b4edfdcN5a7d6faf.jpg"',
 'data-lazy-img="//img12.360buyimg.com/n7/jfs/t7582/66/3048380492/71753/acde79b5/59b85824N836bb714.jpg"',
 'data-lazy-img="//img12.360buyimg.com/n7/jfs/t13354/123/2593801762/295225/2595fcdd/5a433a4aN0ff580a8.jpg"',
 'data-lazy-img="//img13.360buyimg.com/n7/jfs/t8284/363/1326459580/71585/6d3e8013/59b857f2N6ca75622.jpg"',
 'data-lazy-img="//img14.360buyimg.com/n7/jfs/t19786/94/1948790595/190301/ad172397/5adfe50bNd5907d0b.jpg"',
 'data-lazy-img="//img12.360buyimg.com/n7/jfs/t25696/183/1719981196/90401/bcf6106c/5bbac3c5N8b0bd22b.jpg"',
 'data-lazy-img="//img10.360buyimg.com/n7/jfs/t17689/292/1247919821/159809/1c87eb05/5ac1eae4Nce7c8b00.jpg"',
 'data-lazy-img="//img13.360buyimg.com/n7/jfs/t27112/273/1423275096/265013/d92b3181/5be3cb5bN334c8048.jpg"',
 'data-lazy-img="//img11.360buyimg.com/n7/jfs/t20212/346/2286611589/200381/b60dd7b9/5b4ee328Ne725d6fc.jpg"',
 'data-lazy-img="//img11.360buyimg.com/n7/jfs/t1/2533/19/5800/382950/5ba0b1b7Eb550a26e/934ebb6f3f60e2e6.jpg"',
 'data-lazy-img="//img14.360buyimg.com/n7/jfs/t25582/259/1942499054/80811/1fd3432/5bc06426Nc4199ba0.jpg"',
 'data-lazy-img="//img14.360buyimg.com/n7/jfs/t1/1867/31/11716/401006/5bd072f8E6db292ab/f3610e2e816ade0f.jpg"',
 'data-lazy-img="//img14.360buyimg.com/n7/jfs/t17665/190/2499640910/209789/1b439bbd/5afc0ae1N4f34d0fc.jpg"',
 'data-lazy-img="//img14.360buyimg.com/n7/jfs/t1/1156/8/14017/123589/5bd9a4e8E7dbd4a15/70fbbccdf8811111.jpg"',
 'data-lazy-img="//img12.360buyimg.com/n7/jfs/t21043/186/220467895/46630/3417464c/5b0517ccN295c6fdb.jpg"',
 'data-lazy-img="//img13.360buyimg.com/n7/jfs/t11986/295/1484411523/155164/77795126/5a01503cN19d7f1a0.jpg"',
 'data-lazy-img="//img10.360buyimg.com/n7/jfs/t16717/31/854833223/219683/1b8fbfc/5aaa70efNeb070fc3.jpg"',
 'data-lazy-img="//img10.360buyimg.com/n7/jfs/t1/2617/6/6143/237736/5ba1f42aE71124526/e242e3e39ec95d66.jpg"',
 'data-lazy-img="//img14.360buyimg.com/n7/jfs/t16471/32/126289826/309262/39215dc0/5a28b69bN33aaea8a.jpg"',
 'data-lazy-img="//img11.360buyimg.com/n7/jfs/t13804/159/2066981344/272858/5b9f4558/5a31f5e4N284ce5e1.jpg"',
 'data-lazy-img="//img12.360buyimg.com/n7/jfs/t1/1717/22/4137/189715/5b9b62e1E14bd7f03/c522c2da3c36757a.jpg"',
 'data-lazy-img="//img10.360buyimg.com/n7/jfs/t20140/279/2633113298/113707/57d9da77/5b6018c5N6f80495e.jpg"',
 'data-lazy-img="//img12.360buyimg.com/n7/jfs/t2302/16/135479564/94882/c76da045/55f0e877N3c24faa3.jpg"',
 'data-lazy-img="//img13.360buyimg.com/n7/jfs/t10414/363/1280355182/335902/18c2b152/59ded64fNfdb4e9da.jpg"',
 'data-lazy-img="//img14.360buyimg.com/n7/jfs/t25792/215/1911566869/331199/4e1aa140/5bbf1d23N3a4d87c5.jpg"',
 'data-lazy-img="//img14.360buyimg.com/n7/jfs/t18961/90/1109264753/173069/676b99ad/5abc8d2dNa4cc5eac.jpg"',
 'data-lazy-img="//img12.360buyimg.com/n7/jfs/t11944/171/322678362/177635/eb2192c2/59ed5428N909f5413.jpg"',
 'data-lazy-img="//img13.360buyimg.com/n7/jfs/t19495/246/1066578963/107836/c4ca2c64/5ab9bd0cNbd131502.jpg"',
 'data-lazy-img="//img10.360buyimg.com/n7/jfs/t1/21439/25/1397/189754/5c11d0bcE1fa5da35/6150196ac0af97d1.jpg"',
 'data-lazy-img="//img11.360buyimg.com/n7/jfs/t23221/165/1717864732/343200/7b196311/5b681275Nc5761208.jpg"',
 'data-lazy-img="//img14.360buyimg.com/n7/jfs/t25012/345/1827676978/130853/65940865/5bbc6efaNeb227f0b.jpg"',
 'data-lazy-img="//img12.360buyimg.com/n7/jfs/t6010/111/3843138696/73795/bf58700d/5959ab7fN154e56b4.jpg"',
 'data-lazy-img="//img11.360buyimg.com/n7/jfs/t1/2066/29/11643/417926/5bd01e90E38456795/44e8b7c92e33d359.png"',
 'data-lazy-img="//img14.360buyimg.com/n7/jfs/t1/9453/29/1482/225086/5bce9b03Edbefa238/cea9291c7ac7ea8b.jpg"',
 'data-lazy-img="//img13.360buyimg.com/n7/jfs/t20791/100/326176350/239528/dc43767c/5b0a0d77Nffab2525.jpg"',
 'data-lazy-img="//img10.360buyimg.com/n7/jfs/t26620/265/1399858770/131831/54ef50d4/5bc84703Nf58c2318.jpg"',
 'data-lazy-img="//img12.360buyimg.com/n7/jfs/t20599/138/1457715454/457274/4b6036d8/5b29e865Na93d71b3.jpg"',
 'data-lazy-img="//img11.360buyimg.com/n7/jfs/t21328/105/1076382804/333037/61eecec8/5b1f9bfeN2ebc92eb.jpg"',
 'data-lazy-img="//img12.360buyimg.com/n7/jfs/t1/4934/27/2920/428072/5b97bb0cEd0234aa9/ce434747f50912a9.jpg"',
 'data-lazy-img="//img13.360buyimg.com/n7/jfs/t1/5064/31/3461/142209/5b997c0eE8b26d23e/8788a4743af36f36.jpg"',
 'data-lazy-img="//img13.360buyimg.com/n7/jfs/t29710/357/506891612/310414/cfab2c50/5bf6115dN34a25dd8.jpg"']
len(a)
60

After this test, the links to all 60 images on the first page have been captured. Next, the links need to be processed as shown below to make them usable:

'data-lazy-img="//img13.360buyimg.com/n7/jfs/t29710/357/506891612/310414/cfab2c50/5bf6115dN34a25dd8.jpg"'.lstrip('data-lazy-img="').rstrip('"')
'//img13.360buyimg.com/n7/jfs/t29710/357/506891612/310414/cfab2c50/5bf6115dN34a25dd8.jpg'
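
Note that lstrip()/rstrip() remove any run of the given characters from the ends of the string, not a literal prefix or suffix; this happens to work here only because the URLs themselves start with '/' and end right before the closing quote. A slightly more robust sketch that splits on the first '="' instead:

link = 'data-lazy-img="//img13.360buyimg.com/n7/jfs/t29710/357/506891612/310414/cfab2c50/5bf6115dN34a25dd8.jpg"'
# take everything after the first '="' and drop the closing quote
image_url = link.split('="', 1)[1].rstrip('"')
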
import re
from urllib import request,error


def crawl(url,page):
    """
    this function crawls the images from the given url and page number

    args:
        page: int, the number of the webpage you want to crawl
        url: the start url of the webpage
    """
    # crawl the source code of the webpage
    html_source=request.urlopen(url).read()
    # convert the bytes to a string; the data read from the webpage is binary
    html_sourcestr=str(html_source)
    # select the block of source code that contains the product list
    pattern1='<div id="plist".+? <div class="page clearfix">'
    result1=re.compile(pattern1).findall(html_sourcestr)
    paragraph=result1[0]
    fh=open(r'D:\pythoncode\crawler\jingdongpic\image\1.txt','w')
    fh.write(paragraph)
    fh.close()
    # extract the image links from that block; the () in the regex captures the matched part
    pattern2='<img width="220" height="220" data-img="1" (.+?)>'
    imagelist=re.compile(pattern2).findall(paragraph)
    image_url_list=[]
    for image in imagelist:
        # find() returns -1 when the substring is not found
        if image.find('src')==-1:
            image_url=image.lstrip('data-lazy-img="').rstrip('"')
        else:
            image_url=image.lstrip('src="').rstrip('"')
        image_url_list.append(image_url)
    fh1=open(r'D:\pythoncode\crawler\jingdongpic\image\2.txt','w')
    for image in image_url_list:
        fh1.write(image+'\n')
    fh1.close()
    x=1
    for image in image_url_list:
        image_file='D:/pythoncode/crawler/jingdongpic/image/'+str(page)+str(x)+'.jpg'
        image_url='http:'+image
        try:
            request.urlretrieve(image_url,filename=image_file)
        except error.URLError as e :
            if hasattr(e,'code'):
                print(e.code)
                x+=1
            elif hasattr(e,'reason'):
                print(e.reason)
                x+=1
        x+=1
    print('there are {} pictures in page {}'.format(x-1,page))


for i in range (1,3):
    url='https://list.jd.com/list.html?cat=9987,653,655&page='+str(i)
    crawl(url,i)
there are 60 pictures in page 1
there are 60 pictures in page 2
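
urlretrieve() works, but it belongs to urllib's legacy interface and may be deprecated in the future; the same download can be written with urlopen() plus an ordinary file write. A small sketch (the helper name download_image is made up):

from urllib import request


def download_image(image_url, image_file):
    """Download a single image with urlopen() instead of urlretrieve()."""
    with request.urlopen(image_url, timeout=10) as response:
        with open(image_file, 'wb') as f:
            f.write(response.read())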

re.findall()

Return all non-overlapping matches of pattern in string, as a list of strings. The string is scanned
left-to-right, and matches are returned in the order found. If one or more groups are present in the
pattern, return a list of groups; this will be a list of tuples if the pattern has more than one group.
Empty matches are included in the result.

import re
pattern='m="(bp(y|c)thon)"'
string='abbn="aphp"cjhdm="bpython"_y,abbn="aphp"cjhdm="bpcthon"_y'
result=re.compile(pattern).findall(string)
result
[('bpython', 'y'), ('bpcthon', 'c')]
import re
string='https://blog.csdn.net/CSDNedu/article/details/85118920'
pattern='(https?://[^\s)";]+\.(\w|/)*)'
result=re.compile(pattern).findall(string)
result
[('https://blog.csdn.net/CSDNedu/article/details/85118920', '0')]
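
Because of this group behaviour, findall() here returns 2-tuples and only the first element is useful. Rewriting the inner group as a non-capturing group (?:...) and dropping the outer parentheses makes findall() return the whole match as a plain string, which would also remove the need for the link[0] indexing in the link crawler below. A sketch:

import re

string = 'https://blog.csdn.net/CSDNedu/article/details/85118920'
pattern = r'https?://[^\s)";]+\.(?:\w|/)*'
print(re.compile(pattern).findall(string))
# prints ['https://blog.csdn.net/CSDNedu/article/details/85118920']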

Link crawler

from urllib import request
import re


def getlink(url):
    # pretend to be a browser
    headers=('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36')
    opener=request.build_opener()
    opener.addheaders=[headers]
    # install the opener globally
    request.install_opener(opener)
    file=request.urlopen(url)
    data=str(file.read())
    # regular expression that matches links
    pattern='(https?://[^\s)";]+\.(\w|/)*)'
    link=re.compile(pattern).findall(data)
    # remove duplicate links
    link_list=list(set(link))
    return link_list


url='https://blog.csdn.net/'
link_list=getlink(url)
for link in link_list:
    print(link[0])


https://blog.csdn.net/sD7O95O/article/details/81351586
https://avatar.csdn.net/5/9/8/1_rlnlo2pnefx9c.jpg
https://blog.csdn.net/DP29syM41zyGndVF
http://gitbook.cn/gitchat/activity/5a52e91f5881a96df9f4c02c\r\ngitchat2:http://gitbook.cn/gi...
https://blog.csdn.net/gitchat/article/details/82971479
https://csdnimg.cn/feed/20181224/aa620a1c10fc30a6e5cd5bdbecf8dc0d.png
https://avatar.csdn.net/4/E/4/1_csdnedu.jpg
https://gitbook.cn/gitchat/column/5c0e149eedba1b683458fd5f
https://blog.csdn.net/poem_qianmo/article/details/82731058
https://avatar.csdn.net/D/3/E/1_dp29sym41zygndvf.jpg
https://blog.csdn.net/beliefer/article/details/84998806
https://csdnimg.cn/feed/20181009/7f80d8ea9896099cf92ae677c414c182.png
https://csdnimg.cn/feed/20181011/8b06ac0790a5812087e0af0bc143baf2.png
https://blog.csdn.net/dog250/article/details/82812235
https://blog.csdn.net/yH0VLDe8VG8ep9VGe/article/details/81463851
https://blog.csdn.net/Jmilk
https://csdnimg.cn/feed/20181008/fd5673e4591d508ab7cac672d9e4031c.jpg
https://blog.csdn.net/DP29syM41zyGndVF/article/details/81463793
https://blog.csdn.net/turingbooks/article/details/82995901
https://avatar.csdn.net/7/0/0/1_hhtnan.jpg
https://csdnimg.cn/feed/20181218/4251c768c0e226f157024bfab37b80eb.png
https://csdnimg.cn/feed/20181217/34f4e4b478b98b796a4ab9d5b7024e1d.png
https://csdnimg.cn/feed/20181212/dc27662fe77eddc41c8b157a2e877b40.png
https://blog.csdn.net/DP29syM41zyGndVF/article/details/79990390
https://blog.csdn.net/Java_3y/article/details/82107339
https://blog.csdn.net/DP29syM41zyGndVF/article/details/79990392
https://blog.csdn.net/sunhuaqiang1/article/details/84991520
https://blog.csdn.net/blogdevteam
https://csdnimg.cn/feed/20181217/6e600a03da64bdd60ab485ec3d0f220b.png
https://gitbook.cn/gitchat/column/5ad56a79af8f2f35290f6535
https://csdnimg.cn/feed/20181219/d122a78ef6080af6c94e31063ea6833a.jpg
https://csdnimg.cn/feed/20181212/614e91fd3d235087bcb9e9cc0112ad28.png
https://blog.csdn.net/DP29syM41zyGndVF/article/details/79990372
https://csdnimg.cn/feed/20181218/b46fb2bb97e59d8fedfe1e3bf8e9f2b7.png
http://blog.csdn.net/experts/rule.html
https://blog.csdn.net/blogdevteam/article/details/85164747
https://csdnimg.cn/feed/20181224/37ec7f54dcc8bd84317786cc0b17fd04.png
https://avatar.csdn.net/5/A/F/1_bulprezht1imln4n.jpg
https://blog.csdn.net/CSDNedu
https://avatar.csdn.net/0/D/F/1_sd7o95o.jpg
https://avatar.csdn.net/8/C/E/1_blogdevteam.jpg
https://blog.csdn.net/BULpreZHt1ImlN4N
https://mp.csdn.net/blogmove
https://blog.csdn.net/
https://avatar.csdn.net/C/E/8/1_m2l0zgssvc7r69efdtj.jpg
https://blog.csdn.net/Androidlushangderen/article/details/85058701
https://blog.csdn.net/dog250/article/details/82892267
https://blog.csdn.net/silentwolfyh/article/details/82865579
https://blog.csdn.net/M2l0ZgSsVc7r69eFdTj/article/details/81295203
https://csdnimg.cn/feed/20181212/0786a45d51381df4ae86a3aa785c673d.jpg
https://blog.csdn.net/DP29syM41zyGndVF/article/details/79990339
https://blog.csdn.net/weixin_43430036/article/details/84944372
https://blog.csdn.net/yH0VLDe8VG8ep9VGe
https://csdnimg.cn/feed/20181008/e79279eca0fda46dbfaf319988130033.png
https://csdnimg.cn/feed/20181008/5b7db67ced3b56af839f8b6d85d076a5.jpg
https://avatar.csdn.net/9/D/5/1_jmilk.jpg
http://blog.csdn.net
https://avatar.csdn.net/D/7/D/1_u010870518.jpg
https://blog.csdn.net/mydo/article/details/85200002
https://blog.csdn.net/u010870518
https://csdnimg.cn/feed/20181213/c062c89e428ac9185f1854ee1ea344bd.jpg
https://blog.csdn.net/rlnLo2pNEfx9c/article/details/81295280
https://csdnimg.cn/feed/20181008/64f9e39fd53627fb1b867775a7d6a754.jpg
https://blog.csdn.net/M2l0ZgSsVc7r69eFdTj/article/details/81351371
https://csdnimg.cn/feed/20181224/6c30d429c93965633153d5f663f7a4fe.jpg
https://blog.csdn.net/y80gDg1/article/details/81463731
https://blog.csdn.net/wireless_com/article/details/85003784
https://gitbook.cn/gitchat/column/5ad70dea9a722231b25ddbf8
https://csdnimg.cn/feed/20181224/e7ac7007511d4eefafc26a4253c700e2.png
https://avatar.csdn.net/9/D/E/1_yh0vlde8vg8ep9vge.jpg
https://blog.csdn.net/liumiaocn/article/details/82696501
https://blog.csdn.net/M2l0ZgSsVc7r69eFdTj
https://blog.csdn.net/BULpreZHt1ImlN4N/article/details/81463791
https://blog.csdn.net/HHTNAN
https://csdnimg.cn/feed/20181217/2b530756eedc57f1e57271e1b0fa332c.png
https://csdnimg.cn/feed/20181008/affb704967f5200cddf09dc2ffe8835a.jpg
https://blog.csdn.net/buptgshengod/article/details/85061059
https://blockchain.csdn.net
https://ads.csdn.net/js/async_new.js
https://blog.csdn.net/rlnLo2pNEfx9c
https://blog.csdn.net/y80gDg1
https://blog.csdn.net/u012999985/article/details/80877671
https://avatar.csdn.net/C/A/C/1_y80gdg1.jpg
https://blog.csdn.net/sD7O95O
https://blog.csdn.net/CSDNedu/article/details/85118920
https://blog.csdn.net/qq_34829447/article/details/85042697

Qiushibaike crawler

(1) Analyse the pattern of the page URLs, build the URL from a variable, and use a for loop to crawl several pages.
(2) Build a function getcontent that crawls a single page. Each item has two parts: the user and the content that user posted. Implementation: access the page while pretending to be a browser, use BeautifulSoup to match and extract the users and the contents according to the patterns in the source code, and then loop over the extracted results.
(3) Use a for loop to call getcontent several times to crawl multiple pages.

from urllib import request
import re
from bs4 import BeautifulSoup


def getcontent(url):
    # pretend to be a browser
    headers=('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36')
    opener=request.build_opener()
    opener.addheaders=[headers]
    # install the opener globally
    request.install_opener(opener)
    # the source code comes back as bytes and needs to be decoded into a string
    data=request.urlopen(url).read().decode('utf-8')
    # create a BeautifulSoup object from the document and a parser
    soup=BeautifulSoup(data,'html.parser')
    # return a list of h2 tags 
    usertag_list=soup.find_all('h2')
    user_list=[]
    # .string returns the tag's text content
    for user in usertag_list:
        # .string only works when the tag has a single child node; if the content contains <br/>, use get_text()
        user_str=user.string
        # strip the leading/trailing '\n'; strings are immutable, so strip() (like every string method) returns a new string
        user_str=user_str.strip('\n')
        user_list.append(user_str)
    # return a list of div tags whose class attribute is 'content'
    contenttag_list=soup.find_all('div',class_='content') 
    content_list=[]
    for content in contenttag_list:
        # the content contains <br/>, so .string cannot be used; use get_text() instead
        # use . to access the child span node
        content_str=content.span.get_text()
        content_str=content_str.strip('\n')
        content_list.append(content_str)
    for i in range(1,len(user_list)+1):
        print('用户{}{}发表的内容是:\n{}'.format(str(i),user_list[i-1],content_list[i-1]))


base_url='https://www.qiushibaike.com/text/page/'
for i in range (1,3):
    url=base_url+str(i)
    print('第{}页:'.format(str(i)))
    getcontent(url)

    

第1页:
用户1晴~风发表的内容是:
凌晨一点半一个人走在巷子里,脚下的树叶上都是冰渣子。一脚踩上去,发出一阵一阵咔嚓声。身后不远处传来高跟鞋的声音,脚步有点凌乱,凭经验感觉这人喝的不少。泥鳅站在旁边的黑暗里,想要看看到底是女装大佬还是酒吧出来的妖艳贱货,或许还能捡点福利。几分钟后一道的身影总算出现在泥鳅的视线里。性~感,妖~娆长发,满足了泥鳅的一切幻想。泥鳅从黑暗走出来对妹子说这么晚一个人不安全,我送你回家吧。妹子点点头笑了笑轻轻的把头靠在了泥鳅肩头。说我叫庞香。泥鳅大惊撒腿就跑,卧槽。。。
用户2空城旧梦她与伞发表的内容是:
老板家有两个女儿,他有心要我入赘他家。昨天去他家,见到了他两个女儿。大女儿身高腿长胸却略平,小女儿身材娇小却有一副傲人的胸。他大女儿热情走到我面前说:“我很高~高兴认识你!”还故意把高字拉长音。他小女儿把大胸一挺,微笑着说:“我挺~挺开心认识你!”挺字也故意拉长音。这特么让我怎么选啊……
用户3胖香发表的内容是:
...
第2页:
用户1无书斋主发表的内容是:
和老婆回家的时间,遇见隔壁栋一堂客。这货问画画:今天圣诞节,正哥给你买什么了?你看我老公给我买的衣服、包包、化妆品,说着冲画画扬起手中的袋子……我赶忙接话:嫂子,你又不是不知道,我哪有你家老李厉害,衣服是夏奈尔还是范思哲的呀?包包LV的吧?化妆品是兰蔻吧?今天中午我看见老李买的,有钱就是好……说完,我拉起画画走了……她在打电话:老李,你给老娘送这垃圾货……
用户2我是煮茶发表的内容是:
一个宝妈来健身房健身塑型,要练一组哑铃推举的动作,女教练跟她说做完,今晚 胸 部可能会很酸,可以用热水敷一下。那宝妈说:“没事,那宝宝今晚有酸奶可以喝!有益宝宝肠道健康...”我当时在练深蹲,扛着杠铃一下子气息不稳,差点整成内伤...
用户3那谁一期一会发表的内容是:
...
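
The difference between .string and get_text() mentioned in the comments above can be seen on a minimal, self-contained example:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div class="content"><span>line one<br/>line two</span></div>', 'html.parser')
# .string is None because the span has several children (text, <br/>, text)
print(soup.span.string)      # prints None
# get_text() concatenates all the text nodes inside the tag
print(soup.span.get_text())  # prints line oneline two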