百度之星 2005年初赛题目四

yiranwuqing

浏览: 393204 次
性别:
来自: 天津

最近访客更多访客>>

一溜烟儿的跑

孤叶翔云

laou

Dickson-Lee

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

百度之星

第四题（共四题 100 分）：低频词过滤（ 40 分）

题目描述：请编写程序，从包含大量单词的文本中删除出现次数最少的单词。如果有多
个单词都出现最少的次数，则将这些单词都删除。

输入数据：程序读入已被命名为 corpus.txt 的一个大数据量的文本文件，该文件包含英
文单词和中文单词，词与词之间以一个或多个 whitespace 分隔。（为便于调试，您可下载
测试 corpus.txt 文件，实际运行时我们会使用不同内容的输入文件。）

输出数据：在标准输出上打印删除了 corpus.txt 中出现次数最少的单词之后的文本（
词与词保持原来的顺序，仍以空格分隔）。

评分标准：程序输出结果必须正确，内存使用越少越好，程序的执行时间越快越好。

思路：因为词汇的频率是一个未知数，在统计的过程中无法探知到哪些多哪些少，必须把所有的词汇进行统计才能统计出哪些词汇词频最低。故1、构造Hash表，对出现的词汇进行统计，统计词频。2、然后获取到词频最低的列表。3、对原文进行处理，剔除低频词汇。

GHash.h

#ifndef __GHASH_H_
#define __GHASH_H_
#include "GList.h"
#define HASHSIZE 512
typedef struct _Item
{
   char * key;
   char * value;
   unsigned int count;
   struct Item * next;
} Item;

void GHashInit();
Item *  HashInSert(char * key,char * value);
Item *  HashSearch(char * key);
int  HashIncrease(char * key);
int  HashRemove(char * key);
void GetMinList(List* glist);
void FreeGHash();
void PrintGHash();

#endif

GHash.c

#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include "GHash.h"
#include "4lib.h"
//#include "GList.h"
static struct Item *hashtab[HASHSIZE];

static void freeItem(Item * item);
static unsigned int _Hash(char *key);
static unsigned int _ELFHash(char *str);

void GHashInit()
{
 	 int i=0;
 	 for(i=0; i<HASHSIZE; i++)
 	 {
	  	hashtab[i]=NULL;
     }
}

Item *  HashInSert(char * key,char * value)
{
 	 Item * np;
 	 unsigned int hashval;
 	 if((np=HashSearch(key))==NULL)
 	 {
         np=(Item *)malloc(sizeof(Item));
         if(NULL==np || NULL==(np->key = (char*)strdup(key))
		       || NULL ==(np->value = (char*)strdup(value)) )
         {
             return NULL;
         }
         hashval = _Hash(key);
         np->next = (struct Item *)hashtab[hashval];
         hashtab[hashval] = (struct Item *)np;
     }
     else
     {
	  	 free(np->value);
	  	 if((np->value=(char *)strdup(value))== NULL)
	  	 {
		    return NULL;
         }
 	 }
 	 return np;
}

Item *  HashSearch(char * key)
{
    Item  *np;

    for(np = (Item *)hashtab[_Hash(key)];
        np != NULL;
        np = np->next)
      if(strcmp(key,np->key) == 0)
           return np;
    return NULL;
}


int  HashIncrease(char * key)
{
 	 Item * np;
 	 unsigned int hashval;
 	 if((np=HashSearch(key))==NULL)
 	 {
         np=(Item *)malloc(sizeof(Item));
         if(NULL==np || NULL ==(np->key = (char *)strdup(key))
		    )
         {
             return -1;
         }
         hashval = _Hash(key);
         np->value=NULL;
         np->count=1;
         np->next = (struct Item *)hashtab[hashval];
         hashtab[hashval] = (struct Item *)np;
         return 1;
     }
     else
     {
	  	 np->count=np->count+1;
	  	 //free(np->value);
	  	 //if((np->value=strdup(value))== NULL)
	  	 //{
		 //   return NULL;
         //}
         return np->count;
 	 }
 	 return np->count;
}

int  HashRemove(char * key)
{
}



void GetMinList(List* glist)
{
 	 GListInit(glist);
 	 GListEmptyList(glist);
 	 int curmin=100;
 	 int i=0;
 	 for(i=0; i<HASHSIZE; i++)
 	 {
	     if(hashtab[i]!=NULL)
	     {
		    Item * tmp=hashtab[i];
		    while(tmp!=NULL)
		    {
			 	if(tmp->count==curmin)
			 	{
				 	GListAddNode(tmp->key,glist);		  
		        }
		        else if(tmp->count<curmin)
		        {
				 	 GListEmptyList(glist);
				 	 GListAddNode(tmp->key,glist);	
				 	 curmin=tmp->count;
	            }
	            tmp=tmp->next;
	        }
		 }
     }
}

void FreeGHash()
{
 	 int i=0;
 	 for(i=0; i<HASHSIZE; i++)
 	 {
	     if(hashtab[i]!=NULL)
	     {
		    Item * tmp;
		    Item * deln;
		    for(tmp=hashtab[i]; tmp!=NULL; tmp=hashtab[i])
		    {
			     hashtab[i]=(struct Item *)(tmp->next);
			     freeItem(tmp);
			}
		 }
     }
}
void PrintGHash()
{
 	 printf("Print Hash:\n");
 	 int i=0;
 	 for(i=0; i<HASHSIZE; i++)
 	 {
	     if(hashtab[i] !=NULL )
	     {
		    printf("%d---",i);
		    Item * tmp;
		    for(tmp=hashtab[i]; tmp!=NULL; tmp=tmp->next)
		    {
			     printf("%s:%s:%d;",tmp->key,tmp->value,tmp->count);
			}
			printf("\n");
		 }
     }
}

static unsigned int _Hash(char *key)
{
    return _ELFHash(key)%HASHSIZE;
}

// ELF Hash Function
static unsigned int _ELFHash(char *str)
{
       unsigned int hash = 0;
       unsigned int x = 0;

       while (*str)
      {
           hash = (hash << 4) + (*str++);//hash左移4位，当前字符ASCII存入hash
           if ((x = hash & 0xF0000000L) != 0)
           {//如果最高的四位不为0，则说明字符多余7个，如果不处理，再加第九个字符时，第一个字符会被移出，因此要有如下处理。

           //该处理，如果对于字符串(a-z 或者A-Z)就会仅仅影响5-8位，否则会影响5-31位，因为C语言使用的算数移位
           hash ^= (x >> 24);
           //清空28-31位。
           hash &= ~x;
           }
       }

      //返回一个符号位为0的数，即丢弃最高位，以免函数外产生影响。(我们可以考虑，如果只有字符，符号位不可能为负)
     return (hash & 0x7FFFFFFF);

}

static void freeItem(Item * item)
{
   free(item->key);
   free(item->value);
   free(item);
   item=NULL;
}

GLish.h

#ifndef __GLIST_H_
#define __GLIST_H_
typedef struct _Node 
{
    char* data;
    struct _Node * next; 
} Node;
typedef struct _List
{
   int count; 
   Node * header;
} List;

void GListInit(List* glist);
void GListAddNode(char* data,List* glist);
void GListEmptyList(List * glist);
int GListGetCount(List* glist);
void GListPrint(List * glist);
#endif

GLish.c

#include <stdio.h>
#include <stdlib.h>
#include "GList.h"
#include "4lib.h"
//static List glist ;
void GListInit(List* glist)
{
 	 glist->count=0;
 	 glist->header=NULL;
}
void GListAddNode(char* data,List* glist)
{
 	Node * node = (Node *)malloc(sizeof(Node));
 	//node->data=data;
 	node->data=(char *)strdup(data);
 	node->next=glist->header;
 	glist->header=node;
 	glist->count=glist->count+1;
}
int GetCount(List* glist)
{
 	return glist->count;
}
void FreeNode(Node * node)
{
 	 free(node->data);
 	 free(node);
 	 node=NULL;
}
void GListEmptyList(List * glist)
{
 	 Node* tmp;
 	 while(glist->header!=NULL)
 	 {
        tmp=glist->header;
        glist->header=tmp->next;
        //free(tmp);
        FreeNode(tmp);
     }
     glist->count=0;
     glist->header=NULL;
}

void GListPrint(List * glist)
{
 	 printf("GList:\n");
 	 Node* tmp=glist->header;
 	 while(tmp!=NULL)
 	 {
        printf("%s ",tmp->data);
        tmp=tmp->next;
     }
     printf("\n");
}

int GListSearch(char * data,List* glist)
{
 	Node* tmp=glist->header;
 	 while(tmp!=NULL)
 	 {
	    if(strcmp(data,tmp->data)==0)
     	    return TRUE;
	    else
            tmp=tmp->next;
     }
     return FALSE;
}

4lib.h

#ifndef __4LIB_H_
#define __4LIB_H
#define TRUE 1
#define FALSE 0
char *strdup(const char *str);

#endif

4lib.c

char *strdup(const char *str)
{
    int n = strlen(str) + 1;
    char *dup = malloc(n);
    if(dup)
    {
        strcpy(dup, str);
    }
    return dup;
}

main.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "GList.h"
#include "GHash.h"
#include "4lib.h"
#define INPUT "4.input.txt"
#define READBUFSIZE 513
#define FREADSIZE ((READBUFSIZE-1))
#define KEEPBUFSIZE (READBUFSIZE*2)
char keepbuf[KEEPBUFSIZE];
extern int  HashIncrease(char * key);
void HandlerItem(const char *item)
{
 	//printf(" %s\n",item);
 	HashIncrease(item);
}

void HanlderBuf(const char * buf)
{
 	 if (buf==NULL)
 	     return;
 	 strncat(keepbuf,buf,strlen(buf));
 	 char *split=" \t\n";
 	 char *tmp,*next;
 	 char *first=strtok(keepbuf,split);
 	 //HandlerItem(first,out);
 	 tmp=first;
 	 while(tmp)
 	 {
          next=strtok(NULL,split);
          if(next==NULL)//end of string
          {
	          strncpy(keepbuf,tmp,strlen(tmp));
	          keepbuf[strlen(tmp)]='\0';
	          break;
	      }
	      else
	      {
		   	  HandlerItem(tmp);
	   	  }
          tmp=next;
          
     }
}

void HandlerItemout(char *item,List * glist)
{
    if(GListSearch(item,glist)==FALSE)
		  printf(" %s ",item);
 	//HashIncrease(item);
}

void HanlderBufout(char * buf,List * glist)
{
 	 if (buf==NULL)
 	     return;
 	 strncat(keepbuf,buf,strlen(buf));
 	 char *split=" \t\n";
 	 char *tmp,*next;
 	 char *first=strtok(keepbuf,split);
 	 //HandlerItem(first,out);
 	 tmp=first;
 	 while(tmp)
 	 {
          next=strtok(NULL,split);
          if(next==NULL)//end of string
          {
	          strncpy(keepbuf,tmp,strlen(tmp));
	          keepbuf[strlen(tmp)]='\0';
	          break;
	      }
	      else
	      {
		   	  HandlerItemout(tmp,glist);
	   	  }
          tmp=next;
          
     }
}


int main(int argc, char *argv[])
{
  FILE * f=fopen(INPUT,"r+");
  if(NULL==f)
  {
	 perror("Open file:");
  }
  
  GHashInit();
  
  char readbuf[READBUFSIZE];
  memset(keepbuf,'\0',KEEPBUFSIZE);
  memset(readbuf,'\0',READBUFSIZE);
  int readnum;
  while((readnum=fread(readbuf,sizeof(char),FREADSIZE,f))>0)
  {
      readbuf[readnum]='\0';
      //printf("%d\n",readnum);
      if(readnum<FREADSIZE)
      {
	      if(feof(f)!=0)//end file
	      {
	         HanlderBuf(readbuf);
	         break;
	      }
          else
          {
		   	  perror("Error!");
	   	  }
	  }
	  else if(readnum==FREADSIZE)
	  {
	   	   HanlderBuf(readbuf);
      }
  }
  
  if(strlen(keepbuf)>0)
    HandlerItem(keepbuf);
  fclose(f);
  PrintGHash();

  List glist;
  GetMinList(&glist);
  printf("list count:%d\n",GetCount(&glist));
  GListPrint(&glist);
  
  FreeGHash();
  
  //hanlde the input
  f=fopen(INPUT,"r+");
  if(NULL==f)
  {
	 perror("Open file:");
  }
  memset(keepbuf,'\0',KEEPBUFSIZE);
  memset(readbuf,'\0',READBUFSIZE);
  readnum;
  while((readnum=fread(readbuf,sizeof(char),FREADSIZE,f))>0)
  {
      readbuf[readnum]='\0';
      //printf("%d\n",readnum);
      if(readnum<FREADSIZE)
      {
	      if(feof(f)!=0)//end file
	      {
	         HanlderBufout(readbuf,&glist);
	         break;
	      }
          else
          {
		   	  perror("Error!");
	   	  }
	  }
	  else if(readnum==FREADSIZE)
	  {
	   	   HanlderBufout(readbuf,&glist);
      }
  }
  
  if(strlen(keepbuf)>0)
    HandlerItemout(keepbuf,&glist);
  fclose(f);
  
  GListEmptyList(&glist);
  system("PAUSE");	
  return 0;
}

该解决方案不包含的内容：

1、在对词汇进行统计的时，未记录词汇的位置，造成在提出低频词汇时需要再次比对，可以提升

2、仅适用于最低频的，无法针对词频范围。如果需要则可以修改为HashTable的冲突列表为顺序列表

3、无法获得每个低频词汇的出现的顺序

4、替换低频词汇效率不高

0
顶

0
踩

分享到：

百度之星 2005年初赛题目五 | conflicting types for

2012-04-07 10:02
浏览 1150
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论