<?php
/**
* ReFilter: a dodgy RSS filter built with dodgy regular expressions.
*
* It processes a string of filters such as
*
*   (in:link:slashdot AND -has:enclosure) OR "taco singing"
*
* Which will filter out all RSS items except those which
*  1. have the text "slashdot" in the <link> element
*  AND
*  2. do not have an <enclosure> element
*  OR
*  3. have any element containing the text "taco singing".
*
* See documentation at http://re.rephrase.net/filter/ for more
* information.
*
* LICENSE
* Do whatever, just keep attribution and tell me if you
* use it for anything cool. (That seems unlikely, though,
* since it's messy as all hell and hardly efficient. :)
* Contact me if you need it under a real license.
*
* ReFilter
* version 0.8, 2005.07.08
* copyright 2005 Sam Angove <sam@rephrase.net>
*/

/**
 * Regex-based RSS/Atom item filter.
 *
 * Usage:
 *   $f = new ReFilter('(in:link:slashdot AND -has:enclosure) OR "taco singing"');
 *   $filtered = $f->filter_rss($xml);
 *
 * A filter string is parsed once (set_filter) into a flat table of filters.
 * Every <item>/<entry> of a feed is then matched against that table and
 * dropped when the root expression evaluates to false.
 */
class ReFilter {
    // Parsed filter table. Leaf entries hold element/positive/mode/search/attribute;
    // entries with mode 'sub' hold a logic ('and'/'or') and two ids of other entries.
    // The root expression is always the last entry.
    var $_filters = array();
    // The raw filter string most recently passed to set_filter().
    var $_filterstring = '';
    // Per-item working copy of $_filters, with 'match' flags set while scanning.
    var $_item_filters = array();
    // Not referenced anywhere in this class; kept so the property list is unchanged.
    var $_logic;
    // md5 key => original text of quoted terms lifted out of the filter string.
    var $_quoted = array();
    // rdf:about values of the items removed by the current filter_rss() call.
    var $_seq = array();

    /**
     * Constructor. Delegates to the legacy PHP 4-style constructor below,
     * which PHP 8 no longer invokes automatically.
     *
     * @param string $filterstring Optional filter expression to parse.
     * @param string $rss          Optional feed; filtered immediately when given
     *                             (the result is discarded by `new`).
     */
    function __construct($filterstring = '', $rss = '') {
        $this->ReFilter($filterstring, $rss);
    }

    /**
     * Legacy entry point (the constructor on old PHP versions).
     *
     * @param string $filterstring Filter expression; nothing happens when empty.
     * @param string $rss          Feed XML to filter right away.
     * @return string|null The filtered feed when both arguments are non-empty.
     */
    function ReFilter($filterstring = '', $rss = '') {
        if ($filterstring) {
            $this->set_filter($filterstring);
            if ($rss) return $this->filter_rss($rss);
        }
    }

    /**
     * Parse a filter string and make it the active filter.
     *
     * @param string $filterstring Filter expression.
     */
    function set_filter($filterstring) {
        $this->_filterstring = $filterstring;
        // Start from a clean slate so an earlier filter cannot leak into this one.
        $this->_filters = array();
        $this->_quoted = array();
        $this->_process_filter_string($this->_filterstring);
    }

    /*
    * Filtering functions. Traverse a feed and remove items matching or not matching
    * a set of filters.
    */

    /**
     * Remove every <item>/<entry> that fails the active filter and tag the
     * first <title> of the feed with the filter string.
     *
     * @param string $xml Feed source.
     * @return string The filtered feed; $xml unchanged when no filter is set.
     */
    function filter_rss($xml) {
        if (empty($this->_filters)) return $xml;

        // Removed-item bookkeeping is per feed.
        $this->_seq = array();
        $xml = preg_replace_callback("#<(item|entry)( .*?)?>(.*?)</\\1>#si", array($this, '_filter_item'), $xml);

        // The filter string is user input: escape "\" and "$" so it cannot be
        // interpreted as a backreference inside the replacement.
        $title = addcslashes(htmlspecialchars($this->_filterstring), '\\$');
        $xml = preg_replace("#<title>(.*?)</title>#", '<title>$1 | Filtered: ' . $title . '</title>', $xml, 1);

        // For RSS 0.9whatever feeds that have an <rdf:Seq>, remove
        // the entries of the dropped items from it as well.
        if (count($this->_seq) && stristr($xml, 'rdf:Seq') !== false) {
            foreach ($this->_seq as $li) {
                $pattern = '#<rdf:li\s+(?:rdf:)?resource\s*=\s*(["\'])' . preg_quote($li, '#') . '\\1\s*/>#';
                $xml = preg_replace($pattern, '', $xml);
            }
        }
        return $xml;
    }

    /**
     * preg_replace_callback handler for one <item>/<entry>.
     *
     * @param array $matches [0] whole item, [2] attribute string of the item
     *                       tag, [3] inner XML.
     * @return string The item unchanged when it passes, '' when it is removed.
     */
    function _filter_item($matches) {
        $this->_item_filters = $this->_filters;
        preg_match_all("#<([a-zA-Z0-9:]*)(.*?)(>(.*?)</\\1>|/>)#s", $matches[3], $ematches, PREG_SET_ORDER);

        foreach ($ematches as $element) {
            // change (e.g.) "dc:date" to "dcdate"
            $tag = strtolower(str_replace(':', '', $element[1]));
            $attributes = $this->_get_attributes($element[2]);
            // Self-closing elements have no content group.
            $content = isset($element[4]) ? $element[4] : '';
            $this->_filter_element($tag, $attributes, $content);
        }

        // The root expression is the last filter in the table.
        if ($this->_test_filter(count($this->_filters) - 1)) {
            return $matches[0];
        }

        // If items have rdf:about attributes and are filtered out, we
        // probably need to remove them from the <rdf:Seq> as well.
        $itat = $this->_get_attributes(isset($matches[2]) ? $matches[2] : '');
        if (isset($itat['rdf:about'])) $this->_seq[] = $itat['rdf:about'];
        return '';
    }

    /**
     * Flag every leaf filter of the current item that this element satisfies.
     *
     * @param string $element    Lower-cased tag name with ':' removed.
     * @param array  $attributes Attribute name => value of the element.
     * @param string $content    Inner text of the element ('' if self-closing).
     */
    function _filter_element($element, $attributes, $content) {
        foreach ($this->_item_filters as $id => $filter) {
            $mode = $filter['mode'];
            if ($mode == 'sub') continue; // logic nodes are evaluated later

            $matched = false;
            if ($mode == 'has') {
                if ($filter['element'] == '_') {
                    // "has:foo": the item contains a <foo> element
                    $matched = ($filter['search'] === $element);
                } elseif ($filter['element'] == $element) {
                    // "has:foo:bar": <foo> carries a bar="" attribute
                    $matched = isset($attributes[$filter['search']]);
                }
            } elseif ($filter['element'] == $element || $filter['element'] == '_') {
                if ($filter['attribute']) {
                    if (isset($attributes[$filter['attribute']])) {
                        $matched = $this->_filter($mode, $attributes[$filter['attribute']], $filter['search']);
                    }
                } else {
                    $matched = $this->_filter($mode, $content, $filter['search']);
                }
            }

            // Never clear a flag an earlier element already set.
            if ($matched) $this->_item_filters[$id]['match'] = true;
        }
    }

    /**
     * Case-insensitive text test.
     *
     * @param string $mode     'start', 'end', or anything else for "contains".
     * @param string $haystack Text to search in.
     * @param string $needle   Text to look for.
     * @return bool True on a match; false when either string is empty.
     */
    function _filter($mode, $haystack, $needle) {
        $haystack = (string) $haystack;
        $needle = (string) $needle;
        if ($haystack === '' || $needle === '') return false;

        $length = strlen($needle);
        switch ($mode) {
            case 'start':
                return strncasecmp($haystack, $needle, $length) === 0;
            case 'end':
                if ($length > strlen($haystack)) return false;
                return strcasecmp(substr($haystack, -$length), $needle) === 0;
            case 'in':
            default:
                // Compare against false: a match of "0" is a falsy string.
                return stristr($haystack, $needle) !== false;
        }
    }

    /*
    * Filter string Processing functions
    *
    * Extract something useful from a string like "(d AND (e OR f) && (g OR h))".
    * The filters array contains search terms; logical relations have mode 'sub' and
    * reference other filters.
    *
    * E.g., from "(a AND b)" (simplified):
    *   [0] => Array (
    *       [search] => a
    *   )
    *   [1] => Array (
    *       [search] => b
    *   )
    *   [2] => Array (
    *       [mode] => sub
    *       [logic] => and
    *       [0] => 0
    *       [1] => 1
    *   )
    */

    /**
     * Parse a whole filter expression into $this->_filters.
     *
     * @param string $str Filter expression.
     */
    function _process_filter_string($str) {
        // Before doing anything, replace "strings with spaces" with
        // md5sums of themselves -- sub 'em back in later. Chance of
        // collision is negligible, and it makes everything much simpler.
        preg_match_all("#-*([a-z0-9:]+:)*(\"[^\"]+\")#i", $str, $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            $whole = $match[0];
            $key = md5($whole);
            $this->_quoted[$key] = $whole;
            $str = str_replace($whole, $key, $str);
        }

        // if parentheses are mismatched, try to balance them
        $open = substr_count($str, '(');
        $close = substr_count($str, ')');
        if ($open > $close) $str .= str_repeat(')', $open - $close);
        elseif ($open < $close) $str = str_repeat('(', $close - $open) . $str;

        // if there are (still) no parentheses, disambiguate the flat expression
        if (strpos($str, '(') === false) $str = $this->_disambiguate($str);

        // Reduce innermost groups until only a single "~id" reference is left.
        while (strpos($str, '(') !== false) {
            $reduced = preg_replace_callback("#-*\(([^()]*)\)#", array($this, '_process_filter_string_callback'), $str);
            if ($reduced === $str || $reduced === null) {
                // Parentheses in an order that cannot be reduced, e.g. ")(":
                // drop them rather than loop forever.
                $reduced = trim(str_replace(array('(', ')'), ' ', $str));
            }
            $str = $reduced;
            if (strpos($str, '(') === false) {
                $str = $this->_disambiguate($str);
            }
        }
    }

    /**
     * preg_replace_callback handler for one innermost "(...)" group,
     * including any leading '-' signs.
     *
     * @param array $matches [0] group with dashes and parentheses, [1] its content.
     * @return string Replacement text (see _disambiguate).
     */
    function _process_filter_string_callback($matches) {
        $positive = $this->_is_positive($matches[0]);
        return $this->_disambiguate($matches[1], $positive);
    }

    /**
     * Resolve double/triple/etc. negatives.
     *
     * @param string $str Text possibly prefixed with '-' characters.
     * @return bool True when the number of leading dashes is even.
     */
    function _is_positive($str) {
        $str = (string) $str;
        $dashes = strlen($str) - strlen(ltrim($str, '-'));
        return $dashes % 2 == 0;
    }

    /**
     * Turn a parenthesis-free expression into either a "~id" reference
     * (one or two terms) or a fully parenthesised string (more terms), e.g.
     * "a AND b AND c" becomes "((a AND b) AND c)".
     *
     * NOTE: the operator pattern is not word-bounded, so an unquoted term
     * containing upper-case "AND"/"OR" (e.g. "FORD") is split at that point.
     *
     * @param string $ambiguous Expression without parentheses.
     * @param bool   $positive  False when the enclosing group was negated.
     * @return string "~id" or a parenthesised expression.
     */
    function _disambiguate($ambiguous, $positive = true) {
        // "( a b )" must not yield empty leading/trailing terms.
        $ambiguous = trim($ambiguous);
        $terms = preg_split("#[ ]*( |AND|OR|[|&]{1,2})[ ]*(?!\))#", $ambiguous, -1, PREG_SPLIT_DELIM_CAPTURE);

        if (count($terms) == 1) {
            return '~' . $this->_process_filter($positive ? $ambiguous : "-$ambiguous");
        }

        if (count($terms) == 3) {
            $one = $this->_process_filter($terms[0]);
            $two = $this->_process_filter($terms[2]);
            $key = count($this->_filters);
            switch ($terms[1]) {
                case 'OR':
                case '|':
                case '||':
                    $logic = 'or';
                    break;
                default: // ' ', 'AND', '&', '&&'
                    $logic = 'and';
                    break;
            }
            $this->_filters[$key] = array('mode' => 'sub', 'logic' => $logic, 'positive' => $positive, $one, $two);
            // Use something nobody's going to be entering to mark off references
            // to the logic array.
            return '~' . $key;
        }

        // A longer expression: pair the terms up with added parentheses.
        $last = count($terms);
        $out = '';
        $popen = false;
        foreach ($terms as $index => $term) {
            if ($index % 2 != 0) {
                $out .= " $term ";
            } elseif ($index == $last - 1 && !$popen) {
                $out .= $term;
            } elseif (!$popen) {
                $out .= "($term";
                $popen = true;
            } else {
                $out .= "$term)";
                $popen = false;
            }
        }
        // Keep the sign of a negated group so it applies to the whole result.
        return ($positive ? '' : '-') . "($out)";
    }

    /**
     * Register a single term and return its id in $this->_filters.
     *
     * A term is either a reference to an existing filter ("~3", "-~3") or a
     * search such as: chicken, -chicken, in:title:chicken, has:enclosure.
     *
     * @param string $filterstring One term.
     * @return int Id of the (new or referenced) filter.
     */
    function _process_filter($filterstring) {
        $filterstring = trim((string) $filterstring);

        // Is the term a reference to a filter that already exists?
        if (preg_match('#^(-*)~([0-9]+)$#', $filterstring, $ref) && isset($this->_filters[(int) $ref[2]])) {
            $id = (int) $ref[2];
            // An odd number of dashes flips the referenced filter.
            if (!$this->_is_positive($ref[1])) {
                $this->_filters[$id]['positive'] = !$this->_filters[$id]['positive'];
            }
            return $id;
        }

        // Sub in the quoted strings we removed earlier.
        foreach ($this->_quoted as $key => $val) {
            $filterstring = str_replace($key, $val, $filterstring);
        }

        $reserved = array('in', 'start', 'end', 'has');
        if (!preg_match("#-*([a-z0-9:]+:)*(\"[^\"]+\"|[^ ]+)#i", $filterstring, $filter)) {
            // Empty term: register a filter that can never match.
            $filter = array('', '', '');
        }

        // positive or negative filter
        $positive = $this->_is_positive($filter[0]);

        // search term (e.g.: "chicken")
        $search = strtolower(trim($filter[2], '"'));

        // search in what? (e.g.: "in:title"); tag and attribute names are
        // compared in lower case, so normalise the selector too.
        $select = explode(':', strtolower(trim($filter[1], ':')));

        $mode = 'in';
        if (in_array($select[0], $reserved, true)) {
            $mode = array_shift($select);
        }

        // if searching in particular element or element attribute
        $element = isset($select[0]) ? $select[0] : '';
        $attribute = isset($select[1]) ? $select[1] : '';

        $this->_filters[] = array('element' => ($element == '') ? '_' : $element,
                            'positive' => $positive,
                            'mode' => $mode, 'search' => $search,
                            'attribute' => $attribute);
        return count($this->_filters) - 1;
    }

    /**
     * Evaluate a logic ('sub') node against the current item.
     *
     * @param array $rule Entry of $_item_filters with mode 'sub'.
     * @return bool Result of the and/or, inverted when the node is negative.
     */
    function _test_logic($rule) {
        $result = false;
        if ($rule['logic'] == 'and') {
            $result = $this->_test_filter($rule[0]) && $this->_test_filter($rule[1]);
        } elseif ($rule['logic'] == 'or') {
            $result = $this->_test_filter($rule[0]) || $this->_test_filter($rule[1]);
        }
        return $rule['positive'] ? $result : !$result;
    }

    /**
     * Evaluate one filter (leaf or logic node) against the current item.
     *
     * @param int|string $id Index into $_item_filters.
     * @return bool Whether the item satisfies the filter.
     */
    function _test_filter($id) {
        $filter = $this->_item_filters[$id];
        if ($filter['mode'] == 'sub') {
            return $this->_test_logic($filter);
        }
        // 'match' only exists once some element has satisfied the filter.
        $matched = !empty($filter['match']);
        return $filter['positive'] ? $matched : !$matched;
    }

    /**
     * From 'a="b" c="d"' to Array{'a'=>'b', 'c'=>'d'}.
     * Names are lower-cased; values keep their case and lose their quotes.
     *
     * @param string $attrstring Attribute part of a tag.
     * @return array Attribute name => value.
     */
    function _get_attributes($attrstring) {
        $attributes = array();
        if (preg_match_all('#([a-zA-Z0-9:\-]+)\s*=\s*(["\'])(.*?)\\2#s', (string) $attrstring, $matches, PREG_SET_ORDER)) {
            foreach ($matches as $m) {
                $attributes[strtolower($m[1])] = $m[3];
            }
        }
        return $attributes;
    }
}
?>