火车头采集器伪原创（附PHP实现代码）

因为最近需要一批数据来做机器学习，所以用火车头采集器来抓数据，数据伪原创用的小发猫的API。以下是PHP实现代码：

<?php

// ---------------------------------------------------------------------------
// Runtime configuration
// ---------------------------------------------------------------------------

// The rewrite API can be slow; allow the script to run for up to 270 seconds.
set_time_limit(270);

// Report only errors, warnings and parse errors (notices are suppressed).
error_reporting(E_ERROR | E_WARNING | E_PARSE);

// Legacy title/content marker; only referenced by commented-out code paths.
define('TITLE_SEPAR', 'xxx**xxx');

// Numeric marker placed between title and content before the text is sent to
// the rewrite API (see compose_separator()).
define('TITLE_SEPAR2', '262661');

// Rewrite API endpoint. The API key has to be appended after "key=".
$url = 'http://api-6.xiaofamao.com/api.php?json=0&v=1&key=';

// Name of the collector label that holds the article body.
$content_tag_name = '内容';

// Optional HTML wrapper for the article body.
// NOTE(review): $headdd opens a <ul> that $taill never closes — confirm the
// intended markup before using these two values.
$headdd = '<figure class="wp-block-gallery columns-3 is-cropped"><ul class="blocks-gallery-grid">';

$taill = '</figure>';

// ---------------------------------------------------------------------------
// Entry point: dispatch on the page type and emit the (possibly modified)
// label array in serialized form.
//
// NOTE(review): $LabelArray is not defined anywhere in this file; it is
// presumably supplied by the collector's plugin runtime before this point —
// confirm against the host application.
// ---------------------------------------------------------------------------
switch ($LabelArray['PageType']) {
    case 'List':    // list page: only raw HTML can be processed here
        break;

    case 'Pages':   // paginated page: only raw HTML can be processed here
        break;

    case 'Content': // default (content) page: only raw HTML can be processed here
        break;

    case 'Save':    // label values can only be modified when saving
        try {
            // Send "title + separator + content" to the rewrite API in one
            // request and split the answer back into its two parts.
            $title   = $LabelArray['标题'];
            $content = $LabelArray[$content_tag_name];

            $article_new = get_wyc_article(compose_article($title, $content));

            // The API answer may lack the separator (e.g. on an API error),
            // in which case index 1 does not exist.
            $title_wyc   = isset($article_new[0]) ? trim($article_new[0]) : '';
            $content_wyc = isset($article_new[1]) ? trim($article_new[1]) : '';
            $content_wyc = fix_newline($content_wyc);

            // NOTE(review): the original code had every write-back of the
            // rewritten body commented out, so only the title is replaced.
            // To also store the rewritten body, enable the next line:
            // $LabelArray[$content_tag_name] = $content_wyc;

            // Normalise brackets and the full-width percent sign. The original
            // applied this to an undefined $new_title, so it never took effect.
            $title_wyc = str_replace(
                array('[', ']', '％'),
                array('【', '】', '%'),
                $title_wyc
            );
            $title_wyc = trim(strip_tags($title_wyc));

            // Keep the original title when the API returned nothing usable.
            if ($title_wyc !== '') {
                $LabelArray['标题'] = $title_wyc;
            }
        } catch (Exception $e) {
            // Surface the failure inside the saved record so it is visible.
            $LabelArray['标题'] .= $e->getMessage();
            $LabelArray[$content_tag_name] .= $e->getMessage();
        }
        break;

    default:
        break;
}

// Hand the label array back in serialized form.
echo serialize($LabelArray);

/**
 * Join a title and a body into one text, with the marker produced by
 * compose_separator() between them.
 *
 * @param string $title   Article title.
 * @param string $content Article body.
 * @return string Title, separator and body concatenated in that order.
 */
function compose_article($title, $content)
{
    return $title . compose_separator() . $content;
}

/**
 * Build the marker inserted between title and body: the TITLE_SEPAR2 value in
 * parentheses, surrounded by line breaks.
 *
 * @return string PHP_EOL . '(' . TITLE_SEPAR2 . ')' . PHP_EOL
 */
function compose_separator()
{
    return PHP_EOL . '(' . TITLE_SEPAR2 . ')' . PHP_EOL;
}

/**
 * Hook for repairing a separator altered by the rewrite API.
 *
 * Currently a pass-through: the input is returned unchanged.
 *
 * @param string $article Text returned by the rewrite API.
 * @return string The same text, unmodified.
 */
function fix_separator($article)
{
    return $article;
}

/**
 * Send a text to the rewrite API and split the answer on the title/body
 * marker.
 *
 * @param string $str Text to rewrite (normally built by compose_article()).
 * @return array Pieces of the answer split on the marker; elements 0 and 1
 *               (when present) are trimmed. With no marker in the answer the
 *               array has a single element.
 */
function get_wyc_article($str)
{
    global $url;

    // The marker is sent surrounded by line breaks, but is matched without
    // them when splitting the answer.
    $separator = str_replace(PHP_EOL, '', compose_separator());

    $wyc = curl_request($url, array('wenzhang' => $str));
    $wyc = fix_separator($wyc);
    $wyc = explode($separator, (string) $wyc);

    if (isset($wyc[0])) {
        $wyc[0] = trim($wyc[0]);
    }
    if (isset($wyc[1])) {
        $wyc[1] = trim($wyc[1]);
    }

    return $wyc;
}

/**
 * Rewrite a title on its own.
 *
 * The title is sent three times, separated by blank lines, and the first line
 * of the answer is returned.
 *
 * @param string $str Title to rewrite.
 * @return string First line of the rewritten text ('' if the API returned
 *                nothing).
 */
function get_wyc_title($str)
{
    $gap   = PHP_EOL . PHP_EOL . PHP_EOL;
    $parts = get_wyc_article($str . $gap . $str . $gap . $str);

    // get_wyc_article() returns an array; the original passed that array
    // straight to explode(), which only accepts a string.
    $text = isset($parts[0]) ? $parts[0] : '';
    $text = fix_newline($text);

    $lines = explode(PHP_EOL, $text);

    return $lines[0];
}

/**
 * Ask the keyword API for keywords of an article.
 *
 * @param string $title    Article title, sent as "title".
 * @param string $contents Article body, sent as "text".
 * @return mixed Whatever curl_request() returns for the call (the raw
 *               response body, or a cURL error message on failure).
 */
function get_keywords($title, $contents)
{
    // The application id has to be appended after "appid=".
    $url_kw = 'http://api-2.78tp.com/nlp/kws.php?appid=';

    return curl_request($url_kw, array(
        'title' => $title,
        'len'   => 100,
        'text'  => $contents,
    ));
}

/**
 * Remove double-quoted alt="..." attributes from markup.
 *
 * @param string $contents HTML fragment.
 * @return string The fragment with every alt="..." attribute removed.
 */
function remove_alt($contents)
{
    // [^"]* stops at the attribute's own closing quote. The original greedy
    // (.*) ran on to the last quote of the line and deleted other attributes.
    return preg_replace('/alt="[^"]*"/', '', $contents);
}

/**
 * Strip common Chinese and ASCII punctuation from a title.
 *
 * @param string $contents Title text.
 * @return string The title without the listed punctuation marks.
 */
function fix_title($contents)
{
    $punctuation_symbol = array(
        '。', '？', '，', '：', '；', '、', '！',
        '.', '?', ',', ':', ';', '!',
    );

    return str_replace($punctuation_symbol, '', $contents);
}

/**
 * Convert HTML line-break tags to PHP_EOL.
 *
 * NOTE(review): the six search strings of the original were lost when the code
 * was published (each rendered as a single space). They are reconstructed here
 * as the usual <br>, <br/> and <br /> variants in any letter case — confirm
 * against the original script.
 *
 * @param string $contents HTML fragment.
 * @return string The fragment with every <br> variant replaced by PHP_EOL.
 */
function br2newline($contents)
{
    return preg_replace('/<br\s*\/?>/i', PHP_EOL, $contents);
}

/**
 * Convert PHP_EOL line breaks to HTML <br/> tags.
 *
 * NOTE(review): the replacement literal of the original was lost when the code
 * was published (it rendered as a single space); "<br/>" is a reconstruction.
 * The original also had a second replacement whose search string was lost in
 * the same way. Taken literally it would delete every space, so it is omitted
 * here — confirm against the original script.
 *
 * @param string $contnets Text with PHP_EOL line breaks (parameter name kept
 *                         as in the original).
 * @return string The text with each PHP_EOL replaced by "<br/>".
 */
function newline2br($contnets)
{
    return str_replace(PHP_EOL, '<br/>', $contnets);
}

/**
 * Collapse repeated line breaks in a text.
 *
 * Thin wrapper that delegates to fix_newline().
 *
 * @param string $contents Text to clean.
 * @return string Result of fix_newline() for the given text.
 */
function delete_newline($contents)
{
    return fix_newline($contents);
}

/**
 * Normalise all line endings to the platform's PHP_EOL.
 *
 * Unlike fix_newline(), consecutive line breaks are preserved.
 *
 * @param string $contents Text with mixed "\r\n", "\r" or "\n" endings.
 * @return string The text with every line ending replaced by PHP_EOL.
 */
function reset_newline_win($contents)
{
    // Reduce everything to "\n" first ("\r\n" before the lone "\r"), so a
    // Windows line ending is not counted as two breaks.
    $contents = str_replace(array("\r\n", "\r"), "\n", $contents);

    return str_replace("\n", PHP_EOL, $contents);
}

/**
 * Collapse every run of line breaks into a single PHP_EOL.
 *
 * "\r" is treated like "\n", so "\r\n" and blank lines all reduce to one
 * line break.
 *
 * @param string $data Text to clean.
 * @return string The text with each run of "\r"/"\n" replaced by one PHP_EOL.
 */
function fix_newline($data)
{
    // Same result as the original "replace \r, collapse \n\n in a loop,
    // convert to PHP_EOL" sequence, done in one pass.
    return preg_replace('/[\r\n]+/', PHP_EOL, $data);
}

/**
 * Strip HTML attributes from a fragment, keeping only a small whitelist.
 *
 * "src" is kept on every element; <img> additionally keeps "alt" and <iframe>
 * additionally keeps "frameborder".
 *
 * @param string $contents HTML fragment.
 * @return string The fragment with all other attributes removed.
 */
function clean_contents($contents)
{
    $sa = new cleanHtml;

    $sa->allow = array('src');
    $sa->exceptions = array(
        'img'    => array('src', 'alt'),
        'iframe' => array('src', 'frameborder'),
    );

    return $sa->strip($contents);
}

/**
 * Replace only the first occurrence of a substring.
 *
 * @param string $search  Substring to look for.
 * @param string $replace Text to put in its place.
 * @param string $subject Text to search in.
 * @return string $subject with the first occurrence of $search replaced, or
 *                $subject unchanged when $search does not occur.
 */
function xfm_strong_str_replace_once($search, $replace, $subject)
{
    $offset = strpos($subject, $search);

    // Nothing to replace.
    if ($offset === false) {
        return $subject;
    }

    $head = substr($subject, 0, $offset);
    $tail = substr($subject, $offset + strlen($search));

    return $head . $replace . $tail;
}

/**
 * Perform an HTTP request with cURL.
 *
 * @param string       $url          URL to request.
 * @param array|string $post         POST data; when empty a GET request is
 *                                   sent. Arrays are URL-encoded, strings are
 *                                   sent as given.
 * @param string       $cookie       Cookie string to send; omitted when empty.
 * @param int|bool     $returnCookie When truthy, response headers are captured
 *                                   and an array is returned instead of the
 *                                   body.
 * @return string|array|bool The response body, or — when $returnCookie is
 *                           truthy — array('cookie' => first Set-Cookie value,
 *                           'content' => body). On a cURL failure the cURL
 *                           error message is returned as a string.
 */
function curl_request($url, $post = '', $cookie = '', $returnCookie = 0)
{
    if (!extension_loaded('curl')) {
        // Legacy fallback: try to load the bundled Windows extension. dl() is
        // not available in every SAPI, so check before calling it.
        if (function_exists('dl') && file_exists('./ext/php_curl.dll')) {
            dl('php_curl.dll');
        }
    }

    $curl = curl_init();

    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)');

    // Older PHP versions refuse FOLLOWLOCATION under open_basedir/safe_mode.
    if (ini_get('open_basedir') == '' && strtolower((string) ini_get('safe_mode')) != 'on') {
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
    }

    curl_setopt($curl, CURLOPT_AUTOREFERER, 1);
    curl_setopt($curl, CURLOPT_REFERER, 'http://XXX');

    if ($post) {
        curl_setopt($curl, CURLOPT_POST, 1);
        // http_build_query() only accepts arrays/objects; pass strings as-is.
        curl_setopt($curl, CURLOPT_POSTFIELDS, is_string($post) ? $post : http_build_query($post));
    }

    if ($cookie) {
        curl_setopt($curl, CURLOPT_COOKIE, $cookie);
    }

    curl_setopt($curl, CURLOPT_HEADER, $returnCookie);
    curl_setopt($curl, CURLOPT_TIMEOUT, 150);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

    $data = curl_exec($curl);

    if (curl_errno($curl)) {
        // Read the message before closing; the original returned here without
        // ever releasing the handle.
        $error = curl_error($curl);
        curl_close($curl);

        return $error;
    }

    curl_close($curl);

    if (!$returnCookie) {
        return $data;
    }

    // Headers and body are separated by the first blank line.
    $parts  = explode("\r\n\r\n", (string) $data, 2);
    $header = $parts[0];
    $body   = isset($parts[1]) ? $parts[1] : '';

    preg_match_all('/Set\-Cookie:([^;]*);/', $header, $matches);

    return array(
        // substr(..., 1) drops the character that follows "Set-Cookie:".
        'cookie'  => isset($matches[1][0]) ? substr($matches[1][0], 1) : '',
        'content' => $body,
    );
}

//echo $tag;

/**
 * Count the characters (not bytes) of a UTF-8 string.
 *
 * @param string|null $string Text to measure; null counts as an empty string.
 * @return int Number of UTF-8 characters; 0 when the text is empty or is not
 *             valid UTF-8.
 */
function utf8_strlen($string = null)
{
    // Split into single characters: "u" treats the subject as UTF-8 and "s"
    // lets "." match line breaks too.
    $found = preg_match_all('/./us', (string) $string, $match);

    // preg_match_all() returns false when the subject is not valid UTF-8.
    if (!$found) {
        return 0;
    }

    return count($match[0]);
}

/**
 * Escape a literal string for use inside a "/"-delimited regular expression.
 *
 * The original hand-written table mapped "(" and ")" to "$" and left "$"
 * unescaped, so the built-in preg_quote() is used instead. It escapes every
 * regex metacharacter plus the "/" delimiter.
 *
 * @param string $str Literal text.
 * @return string The text with regex metacharacters backslash-escaped.
 */
function reg_escape($str)
{
    return preg_quote($str, '/');
}

/**
 * Strip attribute Class
 *
 * Remove attributes from XML/HTML elements, keeping only whitelisted ones.
 *
 * Usage: set $allow / $exceptions / $ignore, then call strip().
 *
 * @author David (semlabs.co.uk)
 * @version 0.2.1
 */
class cleanHtml
{
    /** @var string Markup currently being processed. */
    public $str = '';

    /** @var array Attribute names that are kept on every element. */
    public $allow = array();

    /** @var array Map of element name => attribute names also kept on that element. */
    public $exceptions = array();

    /** @var array Element names that are left completely untouched. */
    public $ignore = array();

    /**
     * Remove all non-whitelisted attributes from the given markup.
     *
     * @param string $str Markup to clean.
     * @return mixed The cleaned markup; non-string or empty input is returned
     *               unchanged.
     */
    public function strip($str)
    {
        $this->str = $str;

        if (is_string($str) && strlen($str) > 0) {
            $res = $this->findElements();

            // A string result means there was nothing to strip.
            if (is_string($res)) {
                return $res;
            }

            $nodes = $this->findAttributes($res);
            $this->removeAttributes($nodes);
        }

        return $this->str;
    }

    /**
     * Collect every opening tag that carries something after its name.
     *
     * @return array|string List of nodes (literal, name, attributes), or the
     *                      unchanged markup when no such tag exists.
     */
    private function findElements()
    {
        $nodes = array();

        // Group 1: tag name; group 2: everything up to the closing ">".
        // Closing tags and "<!...>" constructs are excluded by the name class.
        preg_match_all('/<([^ !\/\>\n]+)([^>]*)>/i', $this->str, $elements);

        foreach ($elements[1] as $el_key => $element_name) {
            if (!$elements[2][$el_key]) {
                continue;
            }

            if (is_array($this->ignore) && !in_array($element_name, $this->ignore)) {
                $nodes[] = array(
                    'literal'    => $elements[0][$el_key],
                    'name'       => $element_name,
                    'attributes' => $elements[2][$el_key],
                );
            }
        }

        // Return the markup itself if there were no attributes to remove.
        // (The original read $nodes[0], which does not exist for an empty list.)
        if (empty($nodes)) {
            return $this->str;
        }

        return $nodes;
    }

    /**
     * Parse the raw attribute text of each node into a list of attributes.
     *
     * @param array $nodes Nodes as produced by findElements().
     * @return array The nodes with 'attributes' replaced by a list of
     *               (literal, name, value) arrays, or null when none parsed.
     */
    private function findAttributes($nodes)
    {
        foreach ($nodes as $key => $node) {
            preg_match_all(
                '/([^ =]+)\s*=\s*["|\']{0,1}([^"\']*)["|\']{0,1}/i',
                $node['attributes'],
                $attributes
            );

            $atts = array();
            foreach ($attributes[1] as $att_key => $attribute_name) {
                $atts[] = array(
                    'literal' => $attributes[0][$att_key],
                    'name'    => $attribute_name,
                    'value'   => $attributes[2][$att_key],
                );
            }

            // Index-based write instead of a by-reference loop variable, so no
            // dangling reference is left behind after the loop.
            $nodes[$key]['attributes'] = $atts ? $atts : null;
        }

        return $nodes;
    }

    /**
     * Rewrite each node's opening tag so it only carries permitted attributes.
     *
     * @param array $nodes Nodes as produced by findAttributes().
     * @return void
     */
    private function removeAttributes($nodes)
    {
        foreach ($nodes as $node) {
            $node_name = $node['name'];
            $new_attributes = '';

            if (is_array($node['attributes'])) {
                foreach ($node['attributes'] as $attribute) {
                    $allowed = is_array($this->allow) && in_array($attribute['name'], $this->allow);

                    if ($allowed || $this->isException($node_name, $attribute['name'], $this->exceptions)) {
                        $new_attributes = $this->createAttributes(
                            $new_attributes,
                            $attribute['name'],
                            $attribute['value']
                        );
                    }
                }
            }

            $replacement = $new_attributes
                ? '<' . $node_name . ' ' . $new_attributes . '>'
                : '<' . $node_name . '>';

            // Plain string replacement: the tag is a literal, and preg_replace
            // would treat "$1" or "\1" inside attribute values as
            // backreferences and corrupt them.
            $this->str = str_replace($node['literal'], $replacement, $this->str);
        }
    }

    /**
     * Tell whether an attribute is whitelisted for one specific element.
     *
     * @param string $element_name   Element (tag) name.
     * @param string $attribute_name Attribute name.
     * @param array  $exceptions     Unused; $this->exceptions is consulted.
     * @return bool True when the attribute is listed for that element.
     */
    private function isException($element_name, $attribute_name, $exceptions)
    {
        if (array_key_exists($element_name, $this->exceptions)) {
            if (in_array($attribute_name, $this->exceptions[$element_name])) {
                return true;
            }
        }

        return false;
    }

    /**
     * Append one name="value" pair to an attribute string.
     *
     * @param string $new_attributes Attribute string built so far.
     * @param string $name           Attribute name.
     * @param string $value          Attribute value.
     * @return string The extended attribute string, space-separated.
     */
    private function createAttributes($new_attributes, $name, $value)
    {
        if ($new_attributes) {
            $new_attributes .= ' ';
        }

        $new_attributes .= $name . '="' . $value . '"';

        return $new_attributes;
    }
}

?>

我们选择方法1:“保存到软件数据库”，同时，选择模式3“网上发布到网站”的“使用自定义发布方式”，选择3“自定义分类标识”，将任务命名为“房地产”，将收藏任务命名为“保存并更新”。由于我们的教程刚刚开始，我们不会做深入的研究。

返回火车头主界面，在“房地产”任务上点击鼠标右键，选择“开始”即可完成采集。采集到的数据将自动发布到模式3所指向网站的指定栏目（标识=3），并保存到“火车头安装目录/数据/序列号-任务名称/蜘蛛结果.mdb”数据库中。

哦，昨天有网友指出了我的一处错误。我要写文案、录视频，还要为网站采集信息，前后忙了3个小时，晕头转向了好几次，仓促之下写得很粗糙，完全是凭感觉写的，让大家看得云里雾里。对不起，请原谅我！现更正如下：

这里，方法1和方法3是并列关系，可以同时选择，也可以只选择其中一个；如果没有发布模块，可以直接采集到本地软件数据库。“本地软件数据库”是微软 Access 格式的数据库，我们可以打开它来浏览和检查数据。

至于模式3，“火车头采集器伪原创”，我将在下面的教程中解释。我希望每个人都能耐心等待。

好了，本教程到此结束！下一课，再见！

特别声明：☆ 本站所有资源仅供学习和研究之用，严禁用于任何商业目的。 ☆ 我们仅提供资源下载，不包含安装、调试等技术支持服务。 ☆ 所有内容均来源于网络，本站不对资源的完整性、可用性或安全性作出任何承诺。 ☆ 请勿将本站资源用于任何违法违规行为，由此产生的后果由使用者自行承担。 ☆ 若您不同意上述声明，请立即停止使用本站内容与服务。 ☆ 涉及付费或赞助资源，请务必自行甄别并谨慎选择。 ☆ 若有内容侵犯您的合法权益，请联系我们，我们将及时处理下架。 ☆ 所有模板或源码需具备一定开发知识，新手建议选购官方正版服务。

火车头采集器伪原创（附PHP实现代码）

猜你喜欢

打造酷炫网站必备！10款最热门的苹果CMS模板推荐

打造精美网站，选择苹果CMS模板从容实现

打造个性网站首选！探索最新苹果CMS模板趋势

苹果CMS模板: 打造独具创意的个人博客！

告别繁琐编程，用苹果CMS模板轻松搭建个性网站

苹果CMS模板带你领略全新网站视觉盛宴

评论0