代码语言
.
CSharp
.
JS
Java
Asp.Net
C
MSSQL
PHP
Css
PLSQL
Python
Shell
EBS
ASP
Perl
ObjC
VB.Net
VBS
MYSQL
GO
Delphi
AS
DB2
Domino
Rails
ActionScript
Scala
代码分类
文件
系统
字符串
数据库
网络相关
图形/GUI
多媒体
算法
游戏
Jquery
Extjs
Android
HTML5
菜单
网页交互
WinForm
控件
企业应用
安全与加密
脚本/批处理
开放平台
其它
【
PHP
】
特定爬虫程序备份
作者:
韩凌飞
/ 发布于
2015/8/19
/
540
用于爬取一个网站上面的黄页信息
<?php
/**
 * One-off crawler for the rz0375.com phone-book listing.
 *
 * Walks listing pages pn170 .. pn999999, follows each entry link found on a
 * listing page, extracts name / phone / mobile / address from the detail page
 * and inserts one row per entry into `ruzhouren`.`rzrmh_tel`.
 *
 * Needs the mysqli, iconv and mbstring extensions, and allow_url_fopen so
 * that file_get_contents() can read http:// URLs.
 */

error_reporting(E_ALL ^ E_NOTICE);
date_default_timezone_set("ETC/GMT-8");

/**
 * Removes all tabs, line breaks and spaces from a fragment of scraped markup.
 *
 * Tags are NOT removed here; callers run strip_tags() separately.
 *
 * @param string $str Raw fragment.
 * @return string The fragment with the whitespace listed above removed.
 */
function DeleteHtml($str)
{
    // NOTE(review): the published listing contained two identical "/ /"
    // replacements; one of them was presumably "&nbsp;" before the page was
    // HTML-decoded, so the entity is stripped here as well - confirm.
    $str = str_replace(array("\r\n", "\r", "\n", "\t", '&nbsp;', ' '), '', trim($str));

    return trim($str);
}

/**
 * Downloads a page and converts it from GB2312 to UTF-8.
 *
 * @param string $url Absolute http:// URL.
 * @return string|false UTF-8 text ('' when the conversion fails),
 *                      or false when the download itself fails.
 */
function fetchPageAsUtf8($url)
{
    // "@" only silences the warning; the failure is reported via the return value.
    $raw = @file_get_contents($url);
    if ($raw === false) {
        return false;
    }

    // //IGNORE is a flag of the *target* encoding; the original attached it
    // to the source encoding.
    return (string) iconv('GB2312', 'UTF-8//IGNORE', trim($raw));
}

/**
 * Pulls the first link out of one "<li>" chunk of the listing page.
 *
 * @param string $listItem Markup of a single listing item.
 * @return string|null Host and path of the link without the "http://"
 *                     prefix, or null when the chunk contains no link.
 */
function extractEntryHostPath($listItem)
{
    $pieces = explode("http://", $listItem);
    if (!isset($pieces[1])) {
        return null;
    }

    // With a single link the URL is cut at the end of its href attribute;
    // with two or more it is cut at the target attribute that follows it.
    $terminator = count($pieces) < 3 ? "\">" : "\" target=\"_blank\"";
    $parts = explode($terminator, $pieces[1]);

    return $parts[0];
}

/**
 * Extracts the labelled fields from a detail page.
 *
 * @param string $detailHtml UTF-8 markup of the detail page.
 * @return array|null Map with keys name, phone, mobile, addr (missing
 *                    fields are ''), or null when the detail container
 *                    is not present in the page.
 */
function parseEntryFields($detailHtml)
{
    if (!preg_match("/<div class=\"con f14 fa\">(.*)<\/div>/mUs", $detailHtml, $match)) {
        return null;
    }

    $body = preg_replace('/<span class="ewm">(.*?)<\/span>/', '', $match[1]);
    $body = preg_replace('/<span class="yp">(.*?)<\/span>/', '', $body);

    // Every record starts from empty values so that nothing leaks over from
    // the previously parsed entry.
    $fields = array('name' => '', 'phone' => '', 'mobile' => '', 'addr' => '');
    $labels = array("名称" => 'name', "电话" => 'phone', "手机" => 'mobile', "地址" => 'addr');

    // NOTE(review): the published listing called explode() with an empty
    // delimiter, which can never work. Splitting on line-break tags is an
    // assumption - confirm against the markup of a live detail page.
    $pieces = preg_split('/<br\s*\/?>/i', (string) $body);
    if ($pieces === false) {
        return $fields;
    }

    foreach ($pieces as $piece) {
        $piece = DeleteHtml($piece);
        $label = mb_substr($piece, 0, 2, 'utf-8');
        if (!isset($labels[$label])) {
            continue;
        }

        // Limit of 2 keeps colons that are part of the value itself.
        $pair = explode(":", strip_tags($piece), 2);
        $fields[$labels[$label]] = isset($pair[1]) ? $pair[1] : '';
    }

    return $fields;
}

// Report database errors through return values so that a single failed
// insert does not abort the whole crawl.
mysqli_report(MYSQLI_REPORT_OFF);

// NOTE(review): credentials are hard-coded; consider moving them out of the source.
$conn = new mysqli("localhost", "ruzhouren", "ruzhourenmysql", "ruzhouren");
if ($conn->connect_errno) {
    die("mysql connect failed: " . $conn->connect_error . "\n");
}
if (!$conn->set_charset('utf8')) {
    die("mysql set charset failed: " . $conn->error . "\n");
}

// Captured once, so every row written by this run carries the same add_dateline.
$time_dateline = time();

// Scraped text is untrusted: bind it as parameters instead of interpolating it.
$insert = $conn->prepare(
    "INSERT INTO `ruzhouren`.`rzrmh_tel` (`id`, `name`, `status`, `order`, `url`, `vip`, `tel`, `tel2`, `addr`, `add_uid`, `own_uid`, `add_dateline`, `type`, `http`) "
    . "VALUES (NULL, ?, '1', '1', '0', '1', ?, ?, ?, '1', '1', ?, '0', ?)"
);
if (!$insert) {
    die("mysql prepare failed: " . $conn->error . "\n");
}

// bind_param() binds by reference: assign these variables, then execute().
$rowName = $rowPhone = $rowMobile = $rowAddr = $rowHttp = '';
$insert->bind_param('ssssis', $rowName, $rowPhone, $rowMobile, $rowAddr, $time_dateline, $rowHttp);

$startPage = 170;
$pageLimit = 1000000;

for ($a = $startPage; $a < $pageLimit; $a++) {
    $listing = fetchPageAsUtf8("http://www.rz0375.com/phonebook/all/pn{$a}/");
    if ($listing === false) {
        die("timeout");
    }

    if (preg_match("/target=\"_blank\" >(.*)<\/a><\/span>/ms", $listing, $li_span)) {
        foreach (explode("<li>", $li_span[0]) as $span) {
            $hostPath = extractEntryHostPath($span);
            if ($hostPath === null || $hostPath === '') {
                continue;
            }

            $detail = fetchPageAsUtf8("http://" . $hostPath);
            if ($detail === false) {
                continue;
            }

            $fields = parseEntryFields($detail);
            if ($fields === null || $fields['name'] === '') {
                // Nothing usable was found; do not write an empty row.
                continue;
            }

            $rowName   = $fields['name'];
            $rowPhone  = $fields['phone'];
            $rowMobile = $fields['mobile'];
            $rowAddr   = $fields['addr'];
            // Literal replacement: the original regex left the dots unescaped.
            $rowHttp   = str_replace('www.rz0375.com/phonebook/', '', $hostPath);

            if ($insert->execute()) {
                echo "insert-----{$rowName} Succeed!" . "\n";
            } else {
                error_log("insert failed for {$rowName}: " . $insert->error);
            }
        }
    }

    // Pause between listing pages (the original slept 5 s and then 10 s).
    sleep(15);
}

$insert->close();
$conn->close();
?>
试试其它关键字
爬虫程序
同语言下
.
用net匹配并替换iOS标准的emoji表情符号
.
处理带Emoji表情的字符串
.
获取微信昵称时 过滤特殊字符
.
通过判断上传文件的头字符来判断文件的类型
.
模拟百度URL加密解密算法
.
以太坊检查地址是否合法
.
实现crontab解析类
.
获取每个月的开始和结束时间
.
图片上传工具类
.
APP手机应用信息采集
可能有用的
.
C#实现的html内容截取
.
List 切割成几份 工具类
.
SQL查询 多列合并成一行用逗号隔开
.
一行一行读取txt的内容
.
C#动态修改文件夹名称(FSO实现,不移动文件)
.
c# 移动文件或文件夹
.
c#图片添加水印
.
Java PDF转换成图片并输出给前台展示
.
网站后台修改图片尺寸代码
.
处理大图片在缩略图时的展示
韩凌飞
贡献的其它代码
(
1
)
.
特定爬虫程序备份
Copyright © 2004 - 2024 dezai.cn. All Rights Reserved
站长博客
粤ICP备13059550号-3