C++ Primer学习笔记第六章tquery.cpp程序的剖析

类别:编程语言 点击:0 评论:0 推荐:

/*

tquery.c

我用的编译器是gnu c++ 和 vc2003,为了此程序能够执行需做以下的修改:

1           tquery.c → tquery.cpp

2           <iostream.h> → <iostream>, <fstream.h> → <fstream> , <stddef.h> → <cstddef>

3           增加 #include <iterator>

4           删除 allocator 和它前面的“,”号,注意要在>>之间留一个空格,因为编译器不是神仙,它会把>>当成操作符

5           删除250行的diff_type,对于现在的编译器,它已经过时了

6           对于 gnu c++ 执行:g++  -o  tquery.ext  tquery.cpp 〔ENTER〕

7           对于vc2003 执行:cl  tquery.cpp 〔ENTER〕

*/

#include <algorithm> // generic algorithms such as copy
#include <map>
#include <set>
#include <string>
#include <utility>   // pair
#include <vector>

#include <fstream>
#include <iostream>
#include <iterator>  // stream iterators and inserters; the original program did not include it

#include <climits>   // SHRT_MAX
#include <cstddef>   // NULL, size_t and related types
#include <cstdlib>   // exit
#include <ctype.h>   // upper/lower case handling (tolower)

 

// Position of one word: `first` is the zero-based line number, `second` the
// zero-based index of the word within that line.  The aliases exist purely
// to make the longer types below easier to read.
typedef std::pair<short,short>    location;

// All word positions, in the order the words were read.
typedef std::vector<location>     loc;

// A sequence of strings (lines of text, or individual words).
typedef std::vector<std::string>  text;

// Words and their positions, kept as two parallel vectors.  Fully spelled out:
// pair< vector<string>*, vector< pair<short,short> >* >
typedef std::pair<text*,loc*>     text_loc;

// Blogger's note: I would have lined location, loc, text and text_loc up in
// one column.  Introducing them one at a time, as the original author does,
// is probably personal taste, but it does keep the definitions from being
// confused with one another.

 

class TextQuery {

public:

       TextQuery() { memset( this, 0, sizeof( TextQuery )); } //我们知道memset是一个C语言的函数,这句把this指向的那块内存大小为sizeof(TextQuery)个单元初值设置为0,我对这个构造函数的理解是:因为此类的成员函数操纵的是一些string,所以把内存做这样的解释是为了提高string处理的效率,因为memset虽然返回的是void*指针,但是却被编译器解释成char*指针

 

       static void filter_elements( string felems ) { filt_elems = felems; }

 

       void query_text();

       void display_map_text();

       void display_text_locations();

       void doit() {

               retrieve_text();

               separate_words();

               filter_text();

               suffix_text();

               strip_caps();

               build_word_map();

       }

 

private:

       void retrieve_text();

       void separate_words();

       void filter_text();

       void strip_caps();

       void suffix_text();

       void suffix_s( string& );

       void build_word_map();

 

private:

       vector<string,allocator>             *lines_of_text;

       text_loc                               *text_locations;

       map<string,loc*,less<string>,allocator> *word_map;

       static string                          filt_elems;

};

 

string TextQuery::filt_elems( "\",.;:!?)(\\/" );

 

int main()

{

       TextQuery tq;

       tq.doit();

       tq.query_text();

        tq.display_map_text();

       return 0;

}

 

void

TextQuery::

retrieve_text()

{

       string file_name;

      

       cout << "please enter file name: ";

       cin  >> file_name;

 

       ifstream infile( file_name.c_str(), ios::in );

       if ( !infile ) {

              cerr << "oops! unable to open file "

                   << file_name << " -- bailing out!\n";

              exit( -1 );

       }

       else cout << "\n";

 

       lines_of_text = new vector<string,allocator>;

        string textline;

 

        while ( getline( infile, textline, '\n' ))

              lines_of_text->push_back( textline );

}

 

void

TextQuery::

separate_words()

{

        vector<string,allocator>   *words     = new vector<string,allocator>;

       vector<location,allocator> *locations = new vector<location,allocator>;

 

        for ( short line_pos = 0; line_pos < lines_of_text->size(); line_pos++ )

        {

              short  word_pos = 0;

                string textline = (*lines_of_text)[ line_pos ];

 

                string::size_type eol = textline.length();

                string::size_type pos = 0, prev_pos = 0;

 

                while (( pos = textline.find_first_of( ' ', pos )) != string::npos )

                {

                        words->push_back( textline.substr( prev_pos, pos - prev_pos ));

                     locations->push_back( make_pair( line_pos, word_pos ));

 

                        word_pos++; pos++; prev_pos = pos;

                }

 

                words->push_back( textline.substr( prev_pos, pos - prev_pos ));

              locations->push_back( make_pair( line_pos, word_pos ));

        }

      

        text_locations = new text_loc( words, locations );

}

 

void

TextQuery::

filter_text()

{

       if ( filt_elems.empty() )

            return;

 

       vector<string,allocator> *words = text_locations->first;

 

       vector<string,allocator>::iterator iter = words->begin();

       vector<string,allocator>::iterator iter_end = words->end();

 

       while ( iter != iter_end )

       {

                string::size_type pos = 0;

                while (( pos = (*iter).find_first_of( filt_elems, pos )) != string::npos )

                       (*iter).erase(pos,1);

              iter++;

       }

}

 

void

TextQuery::

suffix_text()

{

        vector<string,allocator> *words = text_locations->first;

 

        vector<string,allocator>::iterator iter = words->begin();

        vector<string,allocator>::iterator iter_end = words->end();

 

        while ( iter != iter_end )

        {

              // if 3 or less characters, let it be

              if ( (*iter).size() <= 3 ) { iter++; continue; }

              if ( (*iter)[ (*iter).size()-1 ] == 's' )

                     suffix_s( *iter );

 

              // additional suffix handling goes here ...

 

                iter++;

        }

}

 

void

TextQuery::

suffix_s( string &word )

{

        string::size_type spos = 0;

        string::size_type pos3 = word.size()-3;

 

        // "ous", "ss", "is", "ius"

        string suffixes( "oussisius" );

 

        if ( ! word.compare( pos3, 3, suffixes, spos, 3 ) ||

             ! word.compare( pos3, 3, suffixes, spos+6, 3 ) ||

             ! word.compare( pos3+1, 2, suffixes, spos+2, 2 ) ||

             ! word.compare( pos3+1, 2, suffixes, spos+4, 2 ))

                return;

 

        string ies( "ies" );

        if ( ! word.compare( pos3, 3, ies ))

        {

             word.replace( pos3, 3, 1, 'y' );

             return;

        }

 

        string ses( "ses" );

        if ( ! word.compare( pos3, 3, ses ))

        {

             word.erase( pos3+1, 2 );

             return;

        }

 

        // erase ending 's'

        word.erase( pos3+2 );

 

        // watch out for "'s"

        if ( word[ pos3+1 ] == '\'' )

             word.erase( pos3+1 );

}

 

void

TextQuery::

strip_caps()

{

        vector<string,allocator> *words = text_locations->first;

 

        vector<string,allocator>::iterator iter = words->begin();

        vector<string,allocator>::iterator iter_end = words->end();

 

       string caps( "ABCDEFGHIJKLMNOPQRSTUVWXYZ" );

 

        while ( iter != iter_end ) {

                string::size_type pos = 0;

                while (( pos = (*iter).find_first_of( caps, pos )) != string::npos )

                       (*iter)[ pos ] = tolower( (*iter)[pos] );

                ++iter;

        }

}

 

 

void

TextQuery::

build_word_map()

{

     word_map = new map< string, loc*, less<string>, allocator >;

 

     typedef map<string,loc*,less<string>,allocator>::value_type value_type;

     typedef set<string,less<string>,allocator>::difference_type diff_type;

 

     set<string,less<string>,allocator> exclusion_set;

 

     ifstream infile( "exclusion_set" );

     if ( !infile )

     {

          static string default_excluded_words[25] = {

            "the","and","but","that","then","are","been",

            "can","can't","cannot","could","did","for",

            "had","have","him","his","her","its","into",

            "were","which","when","with","would"

          };

 

          cerr << "warning! unable to open word exclusion file! -- "

               << "using default set\n";

 

          copy( default_excluded_words, default_excluded_words+25, inserter( exclusion_set, exclusion_set.begin() ));

     }

     else {

          istream_iterator< string, diff_type > input_set( infile ), eos;

          copy( input_set, eos, inserter( exclusion_set, exclusion_set.begin() ));

     }

 

     // iterate through the the words, entering the key/pair

 

     vector<string,allocator>   *text_words = text_locations->first;

     vector<location,allocator> *text_locs  = text_locations->second;

 

     register int elem_cnt = text_words->size();

     for ( int ix = 0; ix < elem_cnt; ++ix )

         {

                string textword = ( *text_words )[ ix ];

 

                // exclusion strategies

                // less than 3 character or in exclusion set

                if ( textword.size() < 3 ||

                     exclusion_set.count( textword ))

                        continue;

 

                if ( ! word_map->count((*text_words)[ix] ))

                {  // not present, add it:

                   loc *ploc = new vector<location,allocator>;

                   ploc->push_back( (*text_locs)[ix] );

                   word_map->insert( value_type( (*text_words)[ix], ploc ));

                }

                else (*word_map)[(*text_words)[ix]]->push_back( (*text_locs)[ix] );

         }

}

 

void

TextQuery::

query_text()

{

       string query_text;

   

       do {

           cout << "enter a word against which to search the text.\n"

               << "to quit, enter a single character ==>  ";

          cin  >> query_text;

 

          if ( query_text.size() < 2 ) break;

 

           string caps( "ABCDEFGHIJKLMNOPQRSTUVWXYZ" );

           string::size_type pos = 0;

           while (( pos = query_text.find_first_of( caps, pos )) != string::npos )

                    query_text[ pos ] = tolower( query_text[pos] );

 

           // if we index into map, query_text is entered, if absent

           // not at all what we should wish for ...

 

          if ( !word_map->count( query_text )) {

              cout << "\nSorry. There are no entries for "

                   << query_text << ".\n\n";

              continue;

           }

 

          loc *ploc = (*word_map)[ query_text ];

 

           set<short,less<short>,allocator> occurrence_lines; 

           loc::iterator liter = ploc->begin(), liter_end = ploc->end();

 

            while ( liter != liter_end ) {

                  occurrence_lines.insert(occurrence_lines.end(), (*liter).first);

                    ++liter;

            }

 

           register int size = occurrence_lines.size();

           cout << "\n" << query_text

               << " occurs " << size

               << (size == 1 ? " time:" : " times:")

               << "\n\n";

 

           set<short,less<short>,allocator>::iterator it=occurrence_lines.begin();

           for ( ; it != occurrence_lines.end(); ++it ) {

                int line = *it;

 

                cout << "\t( line "

                     // don't confound user with text lines starting at 0 ...

                    << line + 1 << " ) "      

                     << (*lines_of_text)[line] << endl;

           }

 

           cout << endl;

        }

       while ( ! query_text.empty() );

        cout << "Ok, bye!\n";

}

 

void

TextQuery::

display_map_text()

{

        typedef map<string,loc*,less<string>,allocator> map_text;

        map_text::iterator iter = word_map->begin(), iter_end = word_map->end();

 

        while ( iter != iter_end ) {

                cout << "word: " << (*iter).first << " (";

 

                int           loc_cnt = 0;

                loc          *text_locs = (*iter).second;

                loc::iterator liter     = text_locs->begin(),

                              liter_end = text_locs->end();

 

                while ( liter != liter_end )

                {

                        if ( loc_cnt )

                             cout << ",";

                        else ++loc_cnt;

 

                        cout << "(" << (*liter).first

                             << "," << (*liter).second << ")";

 

                        ++liter;

                }

 

                cout << ")\n";

                ++iter;

        }

 

        cout << endl;

}

 

void

TextQuery::

display_text_locations()

{

        vector<string,allocator>   *text_words     = text_locations->first;

        vector<location,allocator> *text_locs      = text_locations->second;

 

        register int elem_cnt = text_words->size();

 

        if ( elem_cnt != text_locs->size() )

        {

             cerr << "oops! internal error: word and position vectors "

                  << "are of unequal size\n"

                  << "words: " << elem_cnt << " "

                  << "locs: "  << text_locs->size()

                  << " -- bailing out!\n";

             exit( -2 );

        }

 

        for ( int ix = 0; ix < elem_cnt; ix++ )

        {

                cout << "word: " << (*text_words)[ ix ] << "\t"

                     << "location: ("

                     << (*text_locs)[ix].first  << ","

                     << (*text_locs)[ix].second << ")"

                     << "\n";

        }

 

        cout << endl;

}

 

/*

sample input text:

------------------

 

Alice Emma has long flowing red hair. Her Daddy says

when the wind blows through her hair, it looks almost alive,

like a fiery bird in flight. A beautiful fiery bird, he tells her,

magical but untamed. "Daddy, shush, there is no such thing,"

she tells him, at the same time wanting him to tell her more.

Shyly, she asks, "I mean, Daddy, is there?"

 

---------------------

sample query session:

---------------------

 

please enter file name: alice_emma

 

warning! unable to open word exclusion file! -- using default set

 

enter a word against which to search the text.

to quit, enter a single character ==>  alice

 

alice occurs 1 time:

 

        ( line 1 ) Alice Emma has long flowing red hair. Her Daddy says

 

enter a word against which to search the text.

to quit, enter a single character ==>  daddy

 

daddy occurs 3 times:

 

        ( line 1 ) Alice Emma has long flowing red hair. Her Daddy says

        ( line 4 ) magical but untamed. "Daddy, shush, there is no such thing,"

        ( line 6 ) Shyly, she asks, "I mean, Daddy, is there?"

 

enter a word against which to search the text.

to quit, enter a single character ==>  phoenix

 

Sorry. There are no entries for phoenix.

 

enter a word against which to search the text.

to quit, enter a single character ==>  .

Ok, bye!

 

----------------------------------------------------------

sample text map after: (a) stripping out punctuation,

(b) eliminating semantically neutral words such as `the`,

(c) suffixing, so that fixes and fix become fix, and

(d) removal of capitalization

-----------------------------------------------------------

 

word: alice ((0,0))

word: alive ((1,10))

word: almost ((1,9))

word: ask ((5,2))

word: beautiful ((2,7))

word: bird ((2,3),(2,9))

word: blow ((1,3))

word: daddy ((0,8),(3,3),(5,5))

word: emma ((0,1))

word: fiery ((2,2),(2,8))

word: flight ((2,5))

word: flowing ((0,4))

word: hair ((0,6),(1,6))

word: has ((0,2))

word: like ((2,0))

word: long ((0,3))

word: look ((1,8))

word: magical ((3,0))

word: mean ((5,4))

word: more ((4,12))

word: red ((0,5))

word: same ((4,5))

word: say ((0,9))

word: she ((4,0),(5,1))

word: shush ((3,4))

word: shyly ((5,0))

word: such ((3,8))

word: tell ((2,11),(4,1),(4,10))

word: there ((3,5),(5,7))

word: thing ((3,9))

word: through ((1,4))

word: time ((4,6))

word: untamed ((3,2))

word: wanting ((4,7))

word: wind ((1,2))

 

*/

 

本文地址:http://com.8s8s.com/it/it25530.htm