Problem: 10 frequent words
Problem.
Find M most frequent words of length greater than 4. Text contains N words.
Word is sequence of alphabetical symbols (A-Z, a-z, А-Я, а-я).
- C code, not optimized, O ( N log N )
- C code (second version), not optimized, O( N log N )
- C++ code (using STL), not optimized, O( N log N )
- C++ code (using STL), optimized, O ( N log M )
- Ruby code
- The Problem: Solution with O(log N) time and O(M) memory
- The problem: M gretest numbers of N. Solution with O(log N) time and O(M) memory
C code, not optimized, O ( N log N )
#include <stdio.h> #include <memory.h> #include <malloc.h> #include <ctype.h> #define P 30011 char *dict[P]; int dict_size = 0; int freq[P]; int hash_index[P]; int isrusalpha(char c) { return isalpha(c) || (c >= 'А' && c <= 'Я') || (c >= 'а' && c <= 'я'); } /* Hash table. Хеш-таблица с открытой адресацией */ int get_id(const char *word) { unsigned int len = 0, h1 = 0, h2 = 0; const char *p = word; while ( *(p++) ) { h1 *= 17; h1 += 13*(*p); h2 *= 13; h2 += 17*(*p); len++; } h1 %= P; h2 %= (P-2); h2 += 1; while ( hash_index[h1] != -1 ) { if ( strcmp (word, dict[ hash_index[h1] ] ) == 0 ) { return hash_index[h1]; } h1 += h2; h1 %= P; } len++; dict[dict_size] = (char*) malloc ( len * sizeof(char) ); strcpy ( dict[dict_size], word ); hash_index[h1] = dict_size; return dict_size++; } int cmp(const void *a, const void *b) { int *aa = (int*)a; int *bb = (int*)b; return freq[*bb] - freq[*aa]; } int main () { char word[1024]; int *sorted_ids; int i = 0; int c; memset (hash_index, -1, sizeof(hash_index) ); memset (freq, 0, sizeof(freq) ); while ( (c = getchar ()) != EOF ) { if ( isrusalpha (c) ) { do { word[i++] = c; c = getchar(); } while ( isrusalpha(c) && i < 1023 ); word[i] = 0; if ( i >= 5 ) { freq[ get_id (word) ] += 1; } i = 0; } } sorted_ids = (int*) malloc ( dict_size * sizeof(int) ); for (i = 0; i < dict_size; i++) { sorted_ids[i] = i; } qsort ( sorted_ids, dict_size, sizeof(int), cmp ); for (i = 0; i < 10 && i < dict_size; i++ ) { int id = sorted_ids[i]; printf ("%20s %d\n", dict[id], freq[id]); } return 0; }
C code (second version), not optimized, O( N log N )
Works time O( N log N ), where N is text length.#include <stdio.h> #include <memory.h> #include <malloc.h> #include <ctype.h> #define P 30017 // should be prime typedef struct { int id; int freq; char *word; } dict_item_t; dict_item_t dict[P]; char dict_size = 0; int hash_index[P]; int isrusalpha(char c) { return isalpha(c) || (c >= 'А' && c <= 'Я') || (c >= 'а' && c <= 'я'); } /* Хеш-таблица с открытой адресацией */ int get_id(const char *word) { unsigned int len = 0, h1 = 0, h2 = 0; const char *p = word; while ( *(p++) ) { h1 *= 17; h1 += 13*(*p); h2 *= 13; h2 += 17*(*p); len++; } h1 %= P; h2 %= (P-2); h2 += 1; // h2 should be > 0 and < P while ( hash_index[h1] != -1 ) { if ( strcmp (word, dict[ hash_index[h1] ].word ) == 0 ) { return hash_index[h1]; } h1 += h2; h1 %= P; } dict[dict_size].word = (char*) malloc ( ++len * sizeof(char) ); strcpy ( dict[dict_size].word, word ); dict[dict_size].id = hash_index[h1] = dict_size; return dict_size++; } int cmp(const void *a, const void *b) { dict_item_t *aa = (dict_item_t*)a; dict_item_t *bb = (dict_item_t*)b; return aa->freq - bb->freq; } // read from stdin all non alphabetical void skip_delimiters() { char c; while( !isrusalpha(c = getchar()) && c != EOF ) { printf("skipping '%c'\n", c); }; if (c != EOF) ungetc( c, stdin ); } // reads word (alphabetical chars) from stdin and returns word length int read_word(char *word, int max_size) { char c; char *p = word; int len = 0; while( len++ < max_size && isrusalpha(c = getchar()) ) *(p++) = c; *p = 0; len--; printf( "READ: %s\n", word); return len; } int main () { char word[1024]; int *sorted_ids; int min_word_length = 5; int i, c; memset (hash_index, -1, sizeof(hash_index) ); while ( !feof(stdin) ){ skip_delimiters(); read_word(word, 1024); if ( strlen( word) >= min_word_length ) dict[ get_id(word) ].freq++; } qsort ( dict, dict_size, sizeof(dict_item_t), cmp ); for (i = 0; i < 10 && i < dict_size; i++ ) { printf ("%20s %d\n", dict[i].word, dict[i].id); } return 0; }
C++ code (using STL), not optimized, O( N log N )
Works time O( N log N ), where N is text length.#include <cstdio> #include <map> #include <vector> #include <string> #include <algorithm> using namespace std; map<string, int> dict; int main () { char word[1024]; int c, i; int answer_size = 10; int min_word_length = 5; while ( (c = getchar ()) != EOF ) { if ( isrusalpha (c) ) { do { word[i++] = c; c = getchar(); } while ( isrusalpha(c) && i < 1023 ); word[i] = 0; if ( i >= min_word_length ) { string s = word; if ( dict.find(s) == dict.end() ) dict[s] = 1; else dict[s] += 1; } i = 0; } } vector< pair<int, string> > ans; map<string, int>::iterator it; for ( it = dict.begin() ; it != dict.end(); it++ ) { ans.push_back( make_pair(-(*it).second, (*it).first) ); } sort( ans.begin(), ans.end() ); vector< pair<int, string> >::iterator it2; for ( it2 = ans.begin(); it2 != ans.end(); it2++, i++ ) { printf ("%20s %d\n", (*it2).second.c_str, -(*it2).first); } return 0; }
C++ code (using STL), optimized, O ( N log M )
Works time O ( N ), where N is text length.#include <cstdio> #include <hashmap> #include <set> #include <string> #include <algorithm> using namespace std; hashmap<string, int> dict; #define forall(a, it) typeof(a)::iterator it=a.begin(); for(; it != a.end(); it++ ) #define forall_r(a, it) typeof(a)::iterator it=a.rbegin(); for(; it != a.rend(); it++ ) int isrusalpha(char c) { return isalpha(c) || (c >= 'А' && c <= 'Я') || (c >= 'а' && c <= 'я'); } int main () { char word[1024]; int answer_size = 10; int min_word_length = 5; int c, i; while ( (c = getchar ()) != EOF ) { if ( isrusalpha (c) ) { i = 0; do { word[i++] = c; c = getchar(); } while ( isrusalpha(c) && i < 1023 ); // printf("%s\n", word); word[i] = 0; if ( i >= min_word_length ) { string s = word; if ( dict.find(s) == dict.end() ) dict[s] = 1; else dict[s] += 1; } } } set< pair<int, string> > ans; forall (dict, it) { ans.insert( make_pair((*it).second, (*it).first) ); if ( ans.size() > answer_size ) { ans.erase( ans.begin() ); } } forall_r (ans, it2)) { printf ("%20s %d\n", it2->second.c_str(), it2->first); } return 0; }
Ruby code
File.open("a.txt") do |file| file.read.scan(/[A-Za-zА-Яа-я]+/).select{|word| word.size > 4}.inject(Hash.new(0)){|f,word| f[word] += 1 f }.to_a.sort_by {|word, freq| -freq}[0..9].each {|word,freq| printf("%20s %5d\n", word, freq) } end
The Problem: Solution with O(log N) time and O(M) memory
- Take a look at STL's nth_element function. Write solution with O( N ) time and O(N) memory.
The problem: M gretest numbers of N. Solution with O(log N) time and O(M) memory
- Write program that finds M greatest of N given numbers using time O(N). Your program should use memory O(M).