我用 curl 模拟浏览器抓取网页,但最后发现抓取到的所有 HTML,无论怎么检测,识别出来的编码都是 UTF-8。不知道是我程序的问题,还是 curl 本身就是如此。
但是改用 file_get_contents 抓取就没有这个问题。
/**
 * Fetch a URL with cURL (sending a browser-like User-Agent) and return the
 * response body as UTF-8.
 *
 * The body is converted only when it is not already valid UTF-8; in that case
 * it is assumed to be GBK/GB2312 and is decoded as CP936 (a superset of
 * GB2312). Converting unconditionally would corrupt pages that are already
 * UTF-8 and would make every result look like UTF-8 to mb_detect_encoding().
 *
 * @param string $url Address of the page to download.
 * @return string The page body encoded as UTF-8, or '' if the transfer failed.
 */
function get_html($url)
{
    $ch = curl_init();
    if ($ch === false) {
        return '';
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4) Gecko/20030624 Netscape/7.1 (ax)");
    $res = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure; do not feed that to mbstring.
    if (!is_string($res) || $res === '') {
        return '';
    }

    // Already UTF-8 (plain ASCII included): return the bytes untouched.
    if (mb_check_encoding($res, 'UTF-8')) {
        return $res;
    }

    // Not valid UTF-8: treat it as a Chinese legacy encoding and convert.
    return mb_convert_encoding($res, 'UTF-8', 'CP936');
}