修订版 | 1cca23a1aa055699937396aa3f773124b9d3fc79 (tree) |
---|---|
时间 | 2007-08-26 23:27:02 |
作者 | henoheno <henoheno> |
Committer | henoheno |
$Id: spam_pickup.php,v 1.57 2007/08/26 14:22:16 henoheno Exp $
spam_uri_pickup_preprocess(): not to rawurldecode(), not to decode '%20'
spam_uri_pickup_preprocess(): Added some
@@ -1,5 +1,5 @@ | ||
1 | 1 | <?php |
2 | -// $Id: spam_pickup.php,v 1.2 2007-08-18 14:47:13 henoheno Exp $ | |
2 | +// $Id: spam_pickup.php,v 1.3 2007-08-26 14:27:02 henoheno Exp $ | |
3 | 3 | // Copyright (C) 2006-2007 PukiWiki Developers Team |
4 | 4 | // License: GPL v2 or (at your option) any later version |
5 | 5 | // |
@@ -23,7 +23,7 @@ function uri_pickup($string = '') | ||
23 | 23 | preg_match_all( |
24 | 24 | // scheme://userinfo@host:port/path/or/pathinfo/maybefile.and?query=string#fragment |
25 | 25 | // Refer RFC3986 (Regex below is not strict) |
26 | - '#(\b[a-z][a-z0-9.+-]{1,8}):[/\\\]+' . // 1: Scheme | |
26 | + '#(\b[a-z][a-z0-9.+-]{1,8}):[/\\\]+' . // 1: Scheme | |
27 | 27 | '(?:' . |
28 | 28 | '([^\s<>"\'\[\]/\#?@]*)' . // 2: Userinfo (Username) |
29 | 29 | '@)?' . |
@@ -31,7 +31,7 @@ function uri_pickup($string = '') | ||
31 | 31 | // 3: Host |
32 | 32 | '\[[0-9a-f:.]+\]' . '|' . // IPv6([colon-hex and dot]): RFC2732 |
33 | 33 | '(?:[0-9]{1,3}\.){3}[0-9]{1,3}' . '|' . // IPv4(dot-decimal): 001.22.3.44 |
34 | - '[a-z0-9_-][a-z0-9_.-]+[a-z0-9_-]' . // hostname(FQDN) : foo.example.org | |
34 | + '[a-z0-9_-][a-z0-9_.-]+[a-z0-9_-]' . // hostname(FQDN) : foo.example.org | |
35 | 35 | ')' . |
36 | 36 | '(?::([0-9]*))?' . // 4: Port |
37 | 37 | '((?:/+[^\s<>"\'\[\]/\#]+)*/+)?' . // 5: Directory path or path-info |
@@ -115,7 +115,6 @@ function uri_pickup_implode($uri = array()) | ||
115 | 115 | return implode('', $tmp); |
116 | 116 | } |
117 | 117 | |
118 | - | |
119 | 118 | // --------------------- |
120 | 119 | // URI normalization |
121 | 120 |
@@ -656,7 +655,7 @@ function _preg_replace_callback_domain_exposure($matches = array()) | ||
656 | 655 | return $result; |
657 | 656 | } |
658 | 657 | |
659 | -// Preprocess: rawurldecode() and adding space(s) and something | |
658 | +// Preprocess: minor-rawurldecode() and adding space(s) and something | |
660 | 659 | // to detect/count some URIs _if possible_ |
661 | 660 | // NOTE: It's maybe danger to var_dump(result). [e.g. 'javascript:'] |
662 | 661 | // [OK] http://victim.example.org/?site:nasty.example.org |
@@ -667,7 +666,20 @@ function spam_uri_pickup_preprocess($string = '', $method = array()) | ||
667 | 666 | { |
668 | 667 | if (! is_string($string)) return ''; |
669 | 668 | |
670 | - $string = spam_uri_removing_hocus_pocus(rawurldecode($string), $method); | |
669 | + // rawurldecode(), just to catch encoded 'http://path/to/file', not to change '%20' to ' ' | |
670 | + $string = strtr( | |
671 | + $string, | |
672 | + array( | |
673 | + '%3A' => ':', | |
674 | + '%3a' => ':', | |
675 | + '%2F' => '/', | |
676 | + '%2f' => '/', | |
677 | + '%5C' => '\\', | |
678 | + '%5c' => '\\', | |
679 | + ) | |
680 | + ); | |
681 | + | |
682 | + $string = spam_uri_removing_hocus_pocus($string, $method); | |
671 | 683 | //var_dump(htmlspecialchars($string)); |
672 | 684 | |
673 | 685 | // Domain exposure (simple) |
@@ -676,15 +688,16 @@ function spam_uri_pickup_preprocess($string = '', $method = array()) | ||
676 | 688 | $string = preg_replace( |
677 | 689 | '#h?ttp://' . |
678 | 690 | '(' . |
679 | - 'ime\.nu/' . '|' . // 2ch.net | |
680 | - 'ime\.st/' . '|' . // 2ch.net | |
691 | + 'ime\.(?:nu|st)/' . '|' . // 2ch.net | |
681 | 692 | 'link\.toolbot\.com/' . '|' . |
682 | 693 | 'urlx\.org/' . '|' . |
683 | 694 | 'big5.51job.com/gate/big5/' . '|' . |
684 | 695 | 'big5.china.com/gate/big5/' . '|' . |
696 | + 'big5.shippingchina.com:8080/' . '|' . | |
685 | 697 | 'big5.xinhuanet.com/gate/big5/' . '|' . |
686 | 698 | 'bhomiyo.com/en.xliterate/' . '|' . |
687 | - 'google.com/translate_c\?u=(?:http://)?' . | |
699 | + 'google.com/translate_c\?u=(?:http://)?' . '|' . | |
700 | + 'web.archive.org/web/2[^/]*/(?:http://)?' . | |
688 | 701 | ')' . |
689 | 702 | '([a-z0-9.%_-]+\.[a-z0-9.%_-]+)' . // nasty.example.org |
690 | 703 | '#i', |
@@ -711,7 +724,7 @@ function spam_uri_pickup_preprocess($string = '', $method = array()) | ||
711 | 724 | ')' . |
712 | 725 | '/' . |
713 | 726 | '([a-z0-9?=&.%_/\'\\\+-]+)' . // 3:path/?query=foo+bar+ |
714 | - '\bsite:([a-z0-9.%_-]+\.[a-z0-9.%_-]+)' . // 4:site:nasty.example.com | |
727 | + '(?:\b|%20)site:([a-z0-9.%_-]+\.[a-z0-9.%_-]+)' . // 4:site:nasty.example.com | |
715 | 728 | '()' . // 5:Preserve or remove? |
716 | 729 | '#i', |
717 | 730 | ), |