Раздел «Язык Си».FindFrequentWords:

Problem: 10 frequent words

Problem. Find M most frequent words of length greater than 4. Text contains N words. Word is sequence of alphabetical symbols (A-Z, a-z, А-Я, а-я).

C code, not optimized, O ( N log N )

#include <stdio.h>
#include <memory.h>
#include <malloc.h>
#include <ctype.h>

#define P 30011

char *dict[P];
int dict_size = 0;
int freq[P];
int hash_index[P];


int 
isrusalpha(char c) {
  return isalpha(c) || (c >= 'А' && c <= 'Я') || (c >= 'а' && c <= 'я');
}

/* Hash table. Хеш-таблица с открытой адресацией */
int 
get_id(const char *word) {
    unsigned int len = 0, h1 = 0, h2 = 0;
    const char *p = word;
    while ( *(p++) ) {
        h1 *= 17; h1 += 13*(*p); 
        h2 *= 13; h2 += 17*(*p);
        len++;
    }
    
    h1 %= P;
    h2 %= (P-2); h2 += 1;

    while ( hash_index[h1] != -1  ) {
        if ( strcmp (word, dict[ hash_index[h1] ] ) == 0 ) {
            return hash_index[h1];
        } 
        h1 += h2; h1 %= P;
    }
    len++;
    dict[dict_size] = (char*) malloc ( len * sizeof(char) );
    strcpy ( dict[dict_size], word );
    hash_index[h1] = dict_size;
    return dict_size++;
}

int 
cmp(const void *a, const void *b) {
    int *aa = (int*)a;
    int *bb = (int*)b;
    return freq[*bb] - freq[*aa];
}

int 
main () {
    char word[1024];
    int *sorted_ids;
    int i = 0;
    int c;
    
    memset (hash_index, -1,  sizeof(hash_index) );
    memset (freq, 0, sizeof(freq) );
    
    while ( (c = getchar ()) != EOF ) {
        if ( isrusalpha (c) ) {
            do  {
                word[i++] = c;
                c = getchar();
            } while ( isrusalpha(c) && i < 1023  );
            word[i] = 0;
            if ( i >= 5 ) {
                freq[ get_id (word) ] += 1;
            }
            i = 0;
        }
    }
    
    sorted_ids = (int*) malloc ( dict_size * sizeof(int) );
    
    for (i = 0; i < dict_size; i++) {
        sorted_ids[i] = i;
    }
    
    qsort ( sorted_ids, dict_size, sizeof(int), cmp );
    
    for (i = 0; i < 10 && i < dict_size; i++ ) {
        int id = sorted_ids[i];
        printf ("%20s %d\n", dict[id], freq[id]);
    }
    
    return 0;
}

C code (second version), not optimized, O( N log N )

Works time O( N log N ), where N is text length.

#include <stdio.h>
#include <memory.h>
#include <malloc.h>
#include <ctype.h>

#define P 30017 // should be prime

typedef struct {
    int id;
    int freq;
    char *word;
} dict_item_t;

dict_item_t dict[P];
char dict_size = 0;
int hash_index[P];

int 
isrusalpha(char c) {
  return isalpha(c) || (c >= 'А' && c <= 'Я') || (c >= 'а' && c <= 'я');
}

/* Хеш-таблица с открытой адресацией */
int 
get_id(const char *word) {
    unsigned int len = 0, h1 = 0, h2 = 0;
    const char *p = word;
    while ( *(p++) ) {
        h1 *= 17; h1 += 13*(*p);
        h2 *= 13; h2 += 17*(*p);
        len++;
    }
    h1 %= P;
    h2 %= (P-2); h2 += 1; // h2 should be > 0 and < P

    while ( hash_index[h1] != -1  ) {
      if ( strcmp (word, dict[ hash_index[h1] ].word ) == 0 ) {
          return hash_index[h1];
      }
      h1 += h2; h1 %= P;
    }
    dict[dict_size].word = (char*) malloc ( ++len * sizeof(char) );
    strcpy ( dict[dict_size].word, word );
    dict[dict_size].id =  hash_index[h1] = dict_size;
    return dict_size++;
}

int 
cmp(const void *a, const void *b) {
    dict_item_t *aa = (dict_item_t*)a;
    dict_item_t *bb = (dict_item_t*)b;
    return aa->freq - bb->freq;
}

// read from stdin all non alphabetical
void 
skip_delimiters() {
    char c;
    while( !isrusalpha(c = getchar()) && c != EOF ) { printf("skipping '%c'\n", c); };
    if (c != EOF) ungetc( c, stdin );
}

// reads word (alphabetical chars) from stdin and returns word length
int 
read_word(char *word, int max_size) {
    char c; char *p = word;
    int len = 0;
    while( len++ < max_size && isrusalpha(c = getchar()) ) *(p++) = c;
    *p = 0; len--;
    printf( "READ: %s\n", word);
    return len;
}

int 
main () {
    char word[1024];
    int *sorted_ids;
    int min_word_length = 5;
    int i, c;
    memset (hash_index, -1,  sizeof(hash_index) );
    while ( !feof(stdin) ){
        skip_delimiters();
        read_word(word, 1024);
        if ( strlen( word) >= min_word_length ) dict[ get_id(word) ].freq++;
    }

    qsort ( dict, dict_size, sizeof(dict_item_t), cmp );

    for (i = 0; i < 10 && i < dict_size; i++ ) {
        printf ("%20s %d\n", dict[i].word, dict[i].id);
    }
    return 0;
}

C++ code (using STL), not optimized, O( N log N )

Works time O( N log N ), where N is text length.

#include <cstdio>
#include <map>
#include <vector>
#include <string>
#include <algorithm>

using namespace std;

map<string, int>  dict;

int main () {
    char word[1024];
    int c, i;
    int  answer_size = 10;
    int min_word_length = 5;
    
    while ( (c = getchar ()) != EOF ) {
        if ( isrusalpha (c) ) {
            do  {
                word[i++] = c;
                c = getchar();
            } while ( isrusalpha(c) && i < 1023  );
    
            word[i] = 0;
            if ( i >= min_word_length ) {
                string s = word;
                if ( dict.find(s) == dict.end()  ) 
                    dict[s] = 1;
                else
                    dict[s] += 1;
            }
            i = 0;
        }
    }

    vector< pair<int, string> > ans;
    map<string, int>::iterator it;

    for ( it = dict.begin() ; it != dict.end(); it++  ) {
        ans.push_back( make_pair(-(*it).second, (*it).first) );
    }

    sort( ans.begin(), ans.end() );
    
    vector< pair<int, string> >::iterator it2;
    
    for ( it2 = ans.begin(); it2 != ans.end(); it2++, i++ ) {
        printf ("%20s %d\n", (*it2).second.c_str, -(*it2).first);
    }
    
    return 0;
}

C++ code (using STL), optimized, O ( N log M )

Works time O ( N ), where N is text length.

#include <cstdio>
#include <hashmap>
#include <set>
#include <string>
#include <algorithm>

using namespace std;

hashmap<string, int>  dict;

#define forall(a, it) typeof(a)::iterator it=a.begin(); for(; it != a.end(); it++ )
#define forall_r(a, it) typeof(a)::iterator it=a.rbegin(); for(; it != a.rend(); it++ )

int isrusalpha(char c) {
  return isalpha(c) || 
  (c >= 'А' && c <= 'Я') || (c >= 'а' && c <= 'я');
}

int main () {
  char word[1024];
  int answer_size = 10;
  int min_word_length = 5;
  int c, i;
  
  while ( (c = getchar ()) != EOF ) {
    if ( isrusalpha (c) ) {
      i = 0;
      do  {
        word[i++] = c;
        c = getchar();
      } while ( isrusalpha(c) && i < 1023  );
      // printf("%s\n", word);
      word[i] = 0;
      if ( i >= min_word_length ) {
        string s = word;
        if ( dict.find(s) == dict.end()  ) 
          dict[s] = 1;
        else
          dict[s] += 1;
      }
    }
  }
  
  set< pair<int, string> > ans;
  forall (dict, it) {
    ans.insert( make_pair((*it).second, (*it).first) );
    if ( ans.size() > answer_size ) {
      ans.erase( ans.begin() );
    }
  }
  
  forall_r (ans, it2)) {
    printf ("%20s %d\n", it2->second.c_str(), it2->first);
  }
  
  return 0;
}

Ruby code

File.open("a.txt") do |file|
   file.read.scan(/[A-Za-zА-Яа-я]+/).select{|word| word.size > 4}.inject(Hash.new(0)){|f,word|
      f[word] += 1
      f
   }.to_a.sort_by {|word, freq| -freq}[0..9].each {|word,freq|
      printf("%20s %5d\n", word, freq)
   }
end

The Problem: Solution with O(log N) time and O(M) memory

The problem: M gretest numbers of N. Solution with O(log N) time and O(M) memory

-- ArtemVoroztsov - 24 Mar 2008