用 Python 抓了一些数据存到本地

最新推荐文章于 2024-08-23 16:53:34 发布

a54349463

最新推荐文章于 2024-08-23 16:53:34 发布

阅读量142

点赞数

文章标签： python

原文链接：http://www.cnblogs.com/Conker/p/6820345.html

版权

import codecs

from xml.dom.minidom import Document
import requests
from bs4 import BeautifulSoup

# Module-level XML document shared by WirteXml and the __main__ script:
# both create their elements and text nodes through this one object.
doc = Document()
def getAllUrl(pageCount):
    """Build the listing-page URL for a given page number.

    Args:
        pageCount: Value substituted for ``{page}`` in the URL template.

    Returns:
        The URL string for that page.
    """
    template = 'https://www.xxx.co/xxxx/{page}'
    page_url = template.format(page=pageCount)
    return page_url

def getHtml(pageCount, timeout=30):
    """Fetch one listing page over HTTP.

    Args:
        pageCount: Page number; converted to a URL by ``getAllUrl``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would hang a crawl over many pages.

    Returns:
        The ``requests`` response object for the page (callers read
        its ``.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not respond
            within ``timeout`` seconds.
    """
    return requests.get(getAllUrl(pageCount), timeout=timeout)

def _appendTextElement(parent, tagName, text):
    """Append ``<tagName>text</tagName>`` to ``parent`` via the shared ``doc``."""
    element = doc.createElement(tagName)
    parent.appendChild(element)
    element.appendChild(doc.createTextNode(text))
    return element


def WirteXml(gName,gImg,wUrl):
    """Append one scraped record to the module-level ``aperson`` element.

    Three child elements are added, in this order: ``<name>``,
    ``<imgUrl>`` and ``<webUrl>``, each holding the matching argument
    as its text content.

    Args:
        gName: Text for the ``<name>`` element.
        gImg: Text for the ``<imgUrl>`` element.
        wUrl: Text for the ``<webUrl>`` element.

    Relies on the module-level names ``doc`` (the XML document) and
    ``aperson`` (the parent element), which must exist before the call.
    """
    _appendTextElement(aperson, "name", gName)
    # The original spelled this call ``img.append.Child(...)``, which raised
    # AttributeError on every record, so the image URL text and the whole
    # <webUrl> element were never written.
    _appendTextElement(aperson, "imgUrl", gImg)
    _appendTextElement(aperson, "webUrl", wUrl)

if __name__ == '__main__':
    # Crawl every listing page, record each <img> found on it in the shared
    # XML document, then write the whole document to disk in one go.
    filename = "people.xml"
    people = doc.createElement("Actresses")
    doc.appendChild(people)
    # WirteXml appends to this module-level element, so the name `aperson`
    # must stay as it is.
    # NOTE(review): every record lands inside this single <person> element;
    # confirm that one <person> per image was not the intent.
    aperson = doc.createElement("person")
    people.appendChild(aperson)
    for count in range(1,1250):
        soup = BeautifulSoup(getHtml(count).text, "lxml")
        for image in soup.findAll("img"):
            try:
                girlName = image.attrs["title"]
                girlImage = image.attrs["src"]
                # Last path segment of the image URL minus its final six
                # characters, appended to the site prefix.
                webUrl = "https://www.xxx.co/xx/" + girlImage.split('/')[-1][:-6]
                WirteXml(girlName, girlImage, webUrl)
            except Exception:
                # Best effort: skip images that lack a title/src attribute
                # or otherwise fail to be recorded. `Exception` instead of a
                # bare `except` so Ctrl+C and SystemExit still stop the run.
                continue
        print("第"+str(count)+"页抓完！！！")
    # Open the output only once there is something to write, and let the
    # context manager close it even if writing fails; an aborted crawl no
    # longer leaves behind an empty, unclosed file.
    with codecs.open(filename, "w", 'utf-8') as f:
        f.write(doc.toprettyxml(indent="  "))

转载于:https://www.cnblogs.com/Conker/p/6820345.html

a54349463

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
用 Python 抓了一些数据存到本地

import codecs; from xml.dom.minidom import Document; import requests; from bs4 import BeautifulSoup; doc = Document(); def getAllUrl(pageCount): url='https://www.xxx.co/xxxx/{page}' ...
复制链接

扫一扫