/**
 * Crawls one hard-coded blog post.
 *
 * NOTE(review): the ($form, $form_state) signature suggests this is wired up
 * as a form submit handler - confirm against the form definition. Neither
 * argument is used here.
 *
 * @param array $form
 *   Unused.
 * @param array $form_state
 *   Unused.
 */
function one_crawl($form, $form_state) {
  $link = 'http://blog.eau-thermale-avene.cn/post/536.html';
  blog_crawler($link);
}

/**
 * Fetches a blog post by URL and saves its content as a "blog" node.
 *
 * Extracts publish date, title, tags, category and body from the page markup,
 * writes them onto a new node through BlogFetch, then stores every entry found
 * under ".msg-boxes .msg-box-content" as an anonymous comment on that node.
 *
 * @param string $link
 *   URL of the post. Nothing happens when it is empty.
 * @param array $context
 *   Receives a 'message' entry containing the node title.
 *   NOTE(review): looks like a Batch API context - confirm with the caller.
 * @param string|null $summary
 *   Summary stored on the node body field.
 *
 * @throws Exception
 *   Propagated from BlogFetch when the page does not answer with HTTP 200.
 */
function blog_crawler($link, &$context = array(), $summary = NULL) {
  if (empty($link)) {
    return;
  }
  watchdog('crawler link', $link);

  $default = array(
    'uid' => 1,
  );
  $fetch = new BlogFetch('blog', $link, $default);

  // str_get_html() yields a falsy value when it cannot build a DOM; stop
  // before anything has been saved instead of dereferencing it.
  $blog = str_get_html($fetch->html);
  if (!$blog) {
    watchdog('crawler link', 'Unable to parse HTML from @link', array('@link' => $link), WATCHDOG_ERROR);
    return;
  }

  // Blog publish date. The date element is emptied afterwards so it does not
  // end up inside the title text read below.
  $date_el = $blog->find('#divMain h2.post-title span.post-date', 0);
  if ($date_el) {
    $created = strtotime($date_el->plaintext);
    $date_el->innertext = '';
    if ($created !== FALSE) {
      $fetch->set('created', $created);
    }
  }
  // The entity property "changed" does not support writing, so it is not set.

  // Blog title: drop the first <span> inside the heading, keep the rest.
  $title_span = $blog->find('#divMain h2.post-title span', 0);
  if ($title_span) {
    $title_span->outertext = '';
  }
  $title_el = $blog->find('#divMain h2.post-title', 0);
  $title = $title_el ? $title_el->innertext : '';
  // NOTE(review): search and replacement are identical here, so this call is
  // a no-op. Presumably the search string was an HTML entity such as
  // "Q&amp;A" that was decoded when this code was copied - confirm.
  $title = str_replace('Q&A', 'Q&A', $title);
  $fetch->set('title', $title);

  // Blog original URL.
  $fetch->set('field_link', array('url' => $link));

  // Blog tags. avene_taxonomy() is defined elsewhere; presumably it resolves
  // (or creates) a term by name in the given vocabulary - verify.
  $terms = array();
  foreach ($blog->find('#divMain div.post-body .post-info a') as $tag) {
    if (!empty($tag->plaintext)) {
      $terms[] = avene_taxonomy($tag->plaintext, 'blog_tags');
    }
  }
  $fetch->set('field_blog_tags', $terms);

  // Blog category. The first three characters of the trimmed text are
  // dropped - presumably a label prefix in front of the name; confirm.
  $cate_el = $blog->find('#divMain div.post-body .post-footer-category', 0);
  if ($cate_el) {
    $cate = drupal_substr(trim($cate_el->plaintext), 3);
    $fetch->set('field_blog_type', avene_taxonomy($cate, 'blog_category'));
  }

  // Blog body: strip the tag list and the footer, keep the remaining markup.
  foreach (array('.post-info', '.post-footer') as $strip) {
    $strip_el = $blog->find('#divMain div.post-body ' . $strip, 0);
    if ($strip_el) {
      $strip_el->outertext = '';
    }
  }
  $body_el = $blog->find('#divMain div.post-body', 0);
  $body = $body_el ? $body_el->innertext : '';
  $fetch->set('body', array('format' => 'full_html', 'summary' => $summary, 'value' => $body));

  // Blog comments. BlogFetch::set() saves the node on every call, so the node
  // id is available at this point.
  $nid = $fetch->entity->nid;
  foreach ($blog->find('.msg-boxes .msg-box-content') as $c) {
    // Skip empty entries but keep going: returning here would drop every
    // later comment and the context message.
    if (empty($c->plaintext)) {
      continue;
    }

    // Fresh object per comment, so values written back by comment_save() on
    // one comment cannot leak into the next one.
    $comment = new stdClass();
    // Node the comment is attached to.
    $comment->nid = $nid;
    // 0 = new comment.
    $comment->cid = 0;
    // Parent comment id, 0 if none.
    $comment->pid = 0;
    // Author uid; 0 is the anonymous user.
    $comment->uid = 0;

    // Remove the link inside the time element so only the date text remains.
    $time_link = $c->find('.msgtime a', 0);
    if ($time_link) {
      $time_link->outertext = '';
    }
    $time_el = $c->find('.msgtime', 0);
    $created = $time_el ? strtotime(trim($time_el->plaintext)) : FALSE;
    if ($created !== FALSE) {
      $comment->created = $created;
    }

    $comment->is_anonymous = 1;
    // Imported comments are published immediately.
    $comment->status = COMMENT_PUBLISHED;
    // Same language as the node.
    $comment->language = LANGUAGE_NONE;
    $comment->subject = '';

    $article_el = $c->find('.msgarticle', 0);
    $val = filter_var($article_el ? $article_el->plaintext : '', FILTER_SANITIZE_SPECIAL_CHARS);
    // The previous strtr($val, array('' => '')) call was removed: an empty
    // string key makes strtr() return FALSE on PHP < 8 (wiping the body) and
    // is ignored with a warning on PHP 8, so it never replaced anything.
    $comment->comment_body[$comment->language][0]['value'] = $val;
    $comment->comment_body[$comment->language][0]['format'] = 'full_html';

    comment_submit($comment);
    // comment_submit() recomputes "created" from $comment->date (which
    // defaults to "now"); put the crawled timestamp back before saving.
    if ($created !== FALSE) {
      $comment->created = $created;
    }
    comment_save($comment);
  }

  $context['message'] = 'fetching ' . $fetch->entity->title;
}

/**
 * Downloads a page and prepares an unsaved node to receive its content.
 *
 * Construction performs the HTTP request first and creates the node entity
 * second, so a failed download throws before any entity exists.
 */
class BlogFetch {

  /**
   * Node bundle used when creating the entity.
   */
  protected $type;

  /**
   * URL the HTML is downloaded from.
   */
  protected $url;

  /**
   * Extra initial values merged into the entity on creation.
   */
  protected $args;

  /**
   * Raw response body of the fetched page.
   */
  public $html;

  /**
   * The node entity being populated.
   */
  public $entity;

  /**
   * Fetches the page and builds the entity.
   *
   * @param string $type
   *   Node bundle.
   * @param string $url
   *   Page URL to download.
   * @param array $args
   *   Initial entity values; a 'type' key in here is overridden by $type.
   *
   * @throws Exception
   *   When the request does not return HTTP 200.
   */
  public function __construct($type, $url, $args = array()) {
    $this->type = $type;
    $this->args = $args;
    $this->url = $url;
    $this->fetchData();
    $this->buildEntity();
  }

  /**
   * Creates the node entity from the bundle and the initial values.
   */
  public function buildEntity() {
    // Array union keeps the left-hand 'type', so $args cannot override it.
    $args = array('type' => $this->type) + $this->args;
    $this->entity = entity_create('node', $args);
  }

  /**
   * Downloads the page into $this->html.
   *
   * @throws Exception
   *   When the response code is anything other than 200.
   */
  public function fetchData() {
    $request = drupal_http_request($this->url);
    if ($request->code != 200) {
      throw new Exception('Failure on fetch:' . $this->url . '. http code:' . $request->code);
    }
    $this->html = $request->data;
  }

  /**
   * Sets one property on the node and saves the node immediately.
   *
   * Every call triggers a save; callers rely on this to obtain a node id
   * without an explicit save step.
   *
   * @param string $property
   *   Property or field name understood by the entity metadata wrapper.
   * @param mixed $value
   *   Value to assign.
   * @param mixed $type
   *   Unused; kept for backward compatibility.
   */
  public function set($property, $value, $type = NULL) {
    $wrapper = entity_metadata_wrapper('node', $this->entity);
    $wrapper->{$property}->set($value);
    $wrapper->save();
  }

  /**
   * Releases the page content and the entity reference.
   */
  public function __destruct() {
    $this->html = NULL;
    $this->entity = NULL;
  }

}
抓取网页数据导入到entity
最新推荐文章于 2020-08-12 15:39:05 发布