修订版 | ba2d3f43ff57f97907be9b5fdccef11aa6e30fda (tree) |
---|---|
时间 | 2007-09-16 01:16:12 |
作者 | henoheno <henoheno> |
Committer | henoheno |
$Id: spam_pickup.php,v 1.60 2007/09/15 15:55:29 henoheno Exp $
* spam_uri_removing_hocus_pocus(): Remove/Replace quoted-spaces within tags
* [img][email]
* technorati.com/blogs/
@@ -1,5 +1,5 @@ | ||
1 | 1 | <?php |
2 | -// $Id: spam_pickup.php,v 1.3 2007-08-26 14:27:02 henoheno Exp $ | |
2 | +// $Id: spam_pickup.php,v 1.4 2007-09-15 16:16:12 henoheno Exp $ | |
3 | 3 | // Copyright (C) 2006-2007 PukiWiki Developers Team |
4 | 4 | // License: GPL v2 or (at your option) any later version |
5 | 5 | // |
@@ -547,7 +547,7 @@ function area_pickup($string = '', $method = array()) | ||
547 | 547 | // [OK] [link]http://nasty.example.com/[/link] |
548 | 548 | // [OK] [url=http://nasty.example.com]visit http://nasty.example.com/[/url] |
549 | 549 | // [OK] [link http://nasty.example.com/]buy something[/link] |
550 | - $regex = '#\[(url|link)\b[^\]]*\].*?\[/\1\b[^\]]*(\])#is'; | |
550 | + $regex = '#\[(url|link|img|email)\b[^\]]*\].*?\[/\1\b[^\]]*(\])#is'; | |
551 | 551 | if (isset($method['area_bbcode'])) { |
552 | 552 | $areas = array(); |
553 | 553 | $count = isset($method['asap']) ? |
@@ -605,9 +605,12 @@ function area_measure($areas, & $array, $belief = -1, $a_key = 'area', $o_key = | ||
605 | 605 | // --------------------- |
606 | 606 | // Spam-uri pickup |
607 | 607 | |
608 | -// Preprocess: Removing uninterest part for URI detection | |
608 | +// Preprocess: Removing/Modifying uninterest part for URI detection | |
609 | 609 | function spam_uri_removing_hocus_pocus($binary = '', $method = array()) |
610 | 610 | { |
611 | + $from = $to = array(); | |
612 | + | |
613 | + // Remove sequential spaces and too short lines | |
611 | 614 | $length = 4 ; // 'http'(1) and '://'(2) and 'fqdn'(1) |
612 | 615 | if (is_array($method)) { |
613 | 616 | // '<a'(2) or 'href='(5) or '>'(1) or '</a>'(4) |
@@ -616,14 +619,17 @@ function spam_uri_removing_hocus_pocus($binary = '', $method = array()) | ||
616 | 619 | isset($method['area_bbcode']) || isset($method['uri_bbcode'])) |
617 | 620 | $length = 1; // Seems not effective |
618 | 621 | } |
619 | - | |
620 | - // Removing sequential spaces and too short lines | |
621 | 622 | $binary = strings($binary, $length, TRUE, FALSE); // Multibyte NOT needed |
622 | 623 | |
624 | + // Remove/Replace quoted-spaces within tags | |
625 | + $from[] = '#(<\w+ [^<>]*?\w ?= ?")([^"<>]*? [^"<>]*)("[^<>]*?>)#ie'; | |
626 | + $to[] = "'$1' . str_replace(' ' , '%20' , trim('$2')) . '$3'"; | |
627 | + | |
623 | 628 | // Remove words (has no '<>[]:') between spaces |
624 | - $binary = preg_replace('/[ \t][\w.,()\ \t]+[ \t]/', ' ', $binary); | |
629 | + $from[] = '/[ \t][\w.,()\ \t]+[ \t]/'; | |
630 | + $to[] = ' '; | |
625 | 631 | |
626 | - return $binary; | |
632 | + return preg_replace($from, $to, $binary); | |
627 | 633 | } |
628 | 634 | |
629 | 635 | // Preprocess: Domain exposure callback (See spam_uri_pickup_preprocess()) |
@@ -680,7 +686,6 @@ function spam_uri_pickup_preprocess($string = '', $method = array()) | ||
680 | 686 | ); |
681 | 687 | |
682 | 688 | $string = spam_uri_removing_hocus_pocus($string, $method); |
683 | - //var_dump(htmlspecialchars($string)); | |
684 | 689 | |
685 | 690 | // Domain exposure (simple) |
686 | 691 | // http://victim.example.org/nasty.example.org/path#frag |
@@ -697,7 +702,8 @@ function spam_uri_pickup_preprocess($string = '', $method = array()) | ||
697 | 702 | 'big5.xinhuanet.com/gate/big5/' . '|' . |
698 | 703 | 'bhomiyo.com/en.xliterate/' . '|' . |
699 | 704 | 'google.com/translate_c\?u=(?:http://)?' . '|' . |
700 | - 'web.archive.org/web/2[^/]*/(?:http://)?' . | |
705 | + 'web.archive.org/web/2[^/]*/(?:http://)?' . '|' . | |
706 | + 'technorati.com/blogs/' . | |
701 | 707 | ')' . |
702 | 708 | '([a-z0-9.%_-]+\.[a-z0-9.%_-]+)' . // nasty.example.org |
703 | 709 | '#i', |