C++多进程英文单词统计_OC/C/C++

C++多进程英文单词统计
/*
 *
 *	Author   :  shenghan , UESTC
 *	
 *	Time     :  2012-11-02
 *
 *  Function :  A word-frequency statistics tool. Usage: wordscan <scan_dir> <result_file>
 *
 */
#include <iostream>
#include <fstream>
#include <vector>
#include <map>
#include <string>
#include <cstring>
#include <cstdlib>
#include <fcntl.h>
#include <unistd.h>
#include <dirent.h>
#include <sys/stat.h>
#include <errno.h>
#include <sys/wait.h>
#include <sys/ipc.h>
#include <sys/msg.h>
using namespace std;
DIR * rootdir;		// root directory being scanned
#define SIZE 50		// capacity of a word buffer in bytes (words are cut at SIZE-1 characters plus NUL)
#define B_SIZE 4096      // number of bytes requested from a file per read() call
void readProcess(char * start_dir);     // body of a reader process: walks one sub-directory tree
void statisticProcess();             // body of the word-counting process
pid_t r_wait(int * stat_loc);         // wait() wrapper that restarts when interrupted by a signal
void doReadFile(char * filename);      // reads one file and sends its words to the message queue
void recordResult();                     // writes the collected counts to the result file
bool isAlphabet(char alphabet);		// true for ASCII letters A-Z / a-z
void getNextWord(char * b_buffer, int * bindex, char * result);   // extracts the next word from a data chunk
// Message exchanged over the System V message queue.
// msgsnd()/msgrcv() interpret the start of the buffer as a `long` message
// type followed directly by the payload, so the first member has to be a
// long; with an int the kernel would read payload bytes as part of the type
// and copy SIZE bytes starting past the real beginning of msg_text.
struct my_msg
{
	long msg_type;             // 1 = word, 2 = end-of-input marker
	char msg_text[SIZE];           // NUL-terminated word
};
typedef struct my_msg message;
int msgqueueid;          // id of the message queue shared by all processes
ofstream ofile;         // result file stream
bool isHalfWord = false;   // set by getNextWord when a chunk ended in the middle of a token
char prehalfword[SIZE];    // first part of a word that was cut by a chunk boundary
map<string,int> statisticMap;      // word -> number of occurrences
int main(int argc, char ** argv)
{
	struct dirent * dp;
	struct stat stat_buffer;
	vector<pid_t> readProID;    //保存读进程的进程ID 
	//pipe
	pid_t pid;
	if(argc != 3)
	{
		cerr<<"Usage wordscan <scan_dir>   <resultfilename>"<<endl;
		_exit(EXIT_FAILURE);
	}
	ofile.open(argv[2]);
	if(!ofile)
	{
		cerr<<"open result file error ! "<<endl;
		_exit(EXIT_FAILURE);
	}
	//cout<<"root dir :"<<argv[1]<<endl;
	if((rootdir = opendir(argv[1])) == NULL)
	{
		cerr<<"open dir "<<argv[1]<<" failed ! "<<endl;
		_exit(EXIT_FAILURE);
	}
	if((msgqueueid = msgget((key_t)12345,0666 | IPC_CREAT)) == -1) //创建并获取消息队列
	{
		cerr<<"error in create message queue ."<<endl;
		_exit(EXIT_FAILURE);
	}
	
	if((pid = fork()) == -1)  //创建单词统计进程
	{
		cerr<<"fork error !"<<endl;
		_exit(EXIT_FAILURE);
	}
	else if(pid == 0)
	{
		statisticProcess();   //单词统计进程执行的函数
	}
	else
	{
		chdir(argv[1]);
		while((dp = readdir(rootdir)) != NULL)  //遍历指定的扫描根目录
		{
		//	cout<<"cur : "<<dp->d_name<<endl;
			if(dp->d_name[0] == '.') //忽略本目录‘.’、父目录‘..’和所有隐藏的目录及文件
				continue;
			if((stat(dp->d_name,&stat_buffer)) == -1)   //获取扫描到的当前的文件或目录的属性
			{
				cout<<"error in function :stat , in scan "<<argv[1]<<endl;
				_exit(EXIT_FAILURE);
			}
			if(S_ISDIR(stat_buffer.st_mode))   //如果是目录
			{
				//cout<<"a subdir "<<endl;
				if((pid = fork()) == -1)  //对指定的根目录下的每一个子目录创建一个进程
				{
					cerr<<"fork error !"<<endl;
					_exit(EXIT_FAILURE);
				}
				else if(pid == 0){
					break;
				}
				else
				{
					readProID.push_back(pid);//将文件读取京城的进程好保存到vector中
				}
			}
			else if(S_ISREG(stat_buffer.st_mode))  //如果根目录中有普通文件，则直接读取普通文件
			{	
				//cout<<"a regular file "<<endl;
				doReadFile(dp->d_name);
			}
		}
		if(pid == 0)
		{ 
			readProcess(dp->d_name);   //读进程执行的函数
		}
		else
		{
			vector<pid_t>::iterator iter;
			for(iter = readProID.begin();iter != readProID.end();iter ++)
			{
				waitpid(*iter,NULL,0); //wait all the readProcess to end;
			}
			message endmsg;
			endmsg.msg_type = 2;
			strcpy(endmsg.msg_text,"end");
			if((msgsnd(msgqueueid,(void *)&endmsg,SIZE,0)) == -1)  //待所有的读进程都执行完毕后，发送一个结束消息到消息队列，结束消息的消息类型为2
			{
				cerr<<"message send error !"<<endl;
				_exit(EXIT_FAILURE);
			}
			while(r_wait(NULL) > 0);//wait for all the subprocess.
			if(msgctl(msgqueueid,IPC_RMID,NULL) == -1)//删除消息队列
			{
				cerr<<"msgctl(IPC_RMID) failed !"<<endl;
				_exit(EXIT_FAILURE);
			}
			closedir(rootdir);
			ofile.close();
			cout<<"OK !"<<endl;
			return 0;
		}
	}
}
// wait() wrapper that retries for as long as the call fails with EINTR.
// stat_loc is handed to wait() unchanged. Returns the value of the first
// wait() call that either succeeded or failed for a reason other than EINTR.
pid_t r_wait(int * stat_loc)
{
	for(;;)
	{
		const int reaped = wait(stat_loc);
		if(reaped != -1 || errno != EINTR)
			return reaped;
	}
}
void statisticProcess()
{
	cout<<"statisticing ..."<<endl;
	message currentmsg;
	//bool RUN = true;
	if((msgrcv(msgqueueid,(void *)&currentmsg,SIZE,0,0)) == -1)//从消息队列获取第一个消息
	{
		cerr<<"message receive error !"<<endl;
		_exit(EXIT_FAILURE);
		//RUN = false;
	}
	map<string,int>::iterator iter;   //遍历Map(STL)的跌代器
	int tempnum = 0;
	while(currentmsg.msg_type != 2)   //如果接受的消息不是结束类型的消息，则继续循环执行一下代码
	{
		/*
		*在Map(STL)中查找是否有当前接受的单词，如果有的话，则将单词数加1,如果没有的话，则将该单词插入到Map(STL)中，并设置单词数为1
		*/
		if((iter = statisticMap.find(currentmsg.msg_text)) != statisticMap.end())
		{
			tempnum = (*iter).second;
			tempnum ++;
			statisticMap[currentmsg.msg_text] = tempnum;
		}
		else
		{
			statisticMap[currentmsg.msg_text] = 1;
		}
		//cout<<currentmsg.msg_text<<"  type :"<<currentmsg.msg_type<<endl;
		if((msgrcv(msgqueueid,(void *)&currentmsg,SIZE,0,0)) == -1)//从消息队列中接受下一个消息
		{
			cerr<<"message receive error !"<<endl;
			_exit(EXIT_FAILURE);
		//	RUN = false;
		}
	}
	recordResult();
}
void recordResult()
{
	map<string,int>::iterator iter;
	cout<<"writing  result to file ..."<<endl;
	for(iter = statisticMap.begin();iter != statisticMap.end();iter++) //遍历Map(STL)，并将其中的单词统计数据保存到文件中
	{
		ofile<<(*iter).first<<"  "<<(*iter).second<<endl;//记录的格式为<单词> <单数数>
	}
	cout<<"finished writing !"<<endl;
}
void doReadFile(char * filename)
{
	char  b_buffer[B_SIZE+1]; //b_buffer[B_SIZE]='\0'
	//cout<<"begin read ! "<<endl;
	cout<<"-> "<<endl;
	int fd;
	int rbytenum;
	message currentmsg;
	char curreadword[SIZE];
	int begin_index = 0;
	fd = open(filename,O_RDONLY);
	if(fd == -1)
	{
		cerr<<"open file "<<filename<<" error !"<<endl;
		_exit(EXIT_FAILURE);
	}
	do
	{
		rbytenum = read(fd,b_buffer,B_SIZE);//从文件中读取B_SIZE大小的数据块，并保存到b_buffer中
		b_buffer[rbytenum]='\0';//在读取的数据快结尾插上一个结束符
		//printf("%s\n\n\n\n",b_buffer );
		begin_index = 0;
		while(begin_index != rbytenum)  //提取上面获取的数据块中的所有单词
		{
			getNextWord(b_buffer,&begin_index,curreadword);  //在b_buffer中，从begin_index处开始提取一个单词，并保存到curreadword中
			if(isHalfWord)  //去过获取的是半个单词的话，说明当前数据块已经遍历到结尾，跳出本次循环，并从新读取一个数据块
				break;
			if(curreadword[0] != '\0')//如果当前提取的单词不为空
			{
				currentmsg.msg_type = 1;
				strcpy(currentmsg.msg_text,curreadword);
				if((msgsnd(msgqueueid,(void *)&currentmsg,SIZE,0)) == -1) //将当前提取的单词发送到消息队列中
				{
					cerr<<"message send error !"<<endl;
					_exit(EXIT_FAILURE);
				}
			}
		}
		
	}while(rbytenum == B_SIZE);//一直读取文件中的内容，直到结束
	
	//cout<<"read finished !"<<endl;
	close(fd);
}
// Recursively processes the directory `start_dir` (a path relative to the
// current working directory): regular files are handed to doReadFile(),
// sub-directories are visited recursively, and entries whose name starts
// with '.' are skipped.
//
// The working directory is switched to the directory being read because
// entries are accessed by their bare names. On return the working directory
// is the directory that was processed, not the one that was current at the
// time of the call, so callers that keep reading their own directory have
// to switch back themselves (as the recursion below does).
// Any failure terminates the process.
void readProcess(char * start_dir)
{
	DIR * curdir = opendir(start_dir);
	if(curdir == NULL)
	{
		cerr<<"open dir "<<start_dir<<" failed ! "<<endl;
		_exit(EXIT_FAILURE);
	}
	// Enter the directory through its open descriptor; the same descriptor
	// is used later to come back after a recursive call.
	if(fchdir(dirfd(curdir)) == -1)
	{
		cerr<<"change dir to "<<start_dir<<" failed ! "<<endl;
		_exit(EXIT_FAILURE);
	}
	struct dirent * curdp;
	struct stat curstat_buffer;
	while((curdp = readdir(curdir)) != NULL)
	{
		if(curdp->d_name[0] == '.')   // ".", ".." and hidden entries
			continue;
		if((stat(curdp->d_name,&curstat_buffer)) == -1)
		{
			cout<<"error in funtion stat in scan "<<start_dir<<endl;
			_exit(EXIT_FAILURE);
		}
		if(S_ISDIR(curstat_buffer.st_mode))
		{
			readProcess(curdp->d_name);
			// The recursive call left the working directory inside the
			// sub-directory; return to this one.
			if(fchdir(dirfd(curdir)) == -1)
			{
				cerr<<"change dir to "<<start_dir<<" failed ! "<<endl;
				_exit(EXIT_FAILURE);
			}
		}
		else if(S_ISREG(curstat_buffer.st_mode))
		{
			doReadFile(curdp->d_name);
		}
	}
	closedir(curdir);  // otherwise one descriptor stays open per visited directory
}
// Returns true when ch is an ASCII letter, upper case (A-Z) or lower case
// (a-z), and false for every other character.
bool isAlphabet(char  ch)
{
	const bool isUpper = (ch >= 'A' && ch <= 'Z');
	const bool isLower = (ch >= 'a' && ch <= 'z');
	return isUpper || isLower;
}
// Extracts the next word from a NUL-terminated data chunk.
//
//   buffer  chunk to scan
//   bindex  in/out: position at which scanning starts; advanced past
//           everything that was consumed
//   result  out: the extracted word (needs room for SIZE bytes); empty when
//           no word was found before the end of the chunk
//
// Rules implemented below:
//   - blank, '\n' and '\r' separate words; ',' and '.' end a word as well;
//   - a token containing any other non-letter character is discarded up to
//     and including the next separator;
//   - a word is cut at SIZE-1 letters and the rest of it is skipped up to
//     the next blank, '\n' or '\r'.
//
// Global state: when the chunk ends and the last consumed character is not
// a separator, isHalfWord is set to signal that the token may continue in
// the next chunk. When isHalfWord is set on entry, the word is resumed from
// prehalfword.
void getNextWord(char * buffer, int * bindex,char * result)
{
	char ch;
	bool begin = true;
	int index = 0;  // write position inside result
	result[index] = '\0';
	if(isHalfWord)  // continue the word that was cut by the previous chunk
	{
		strcpy(result,prehalfword);
		isHalfWord = false; 
		if((buffer[*bindex] == ' '|| buffer[*bindex] == '\n' || buffer[*bindex] == '\r'))
		{
			// The new chunk starts with whitespace, so the carried part was
			// already a complete word.
			return;
		}
		else
			index = static_cast<int>(strlen(prehalfword));
	}
	while(buffer[*bindex] != '\0')
	{
		ch = buffer[(*bindex)++];
		if((ch == ' ' || ch =='\n' || ch == '\r') && begin)  // whitespace before the word
			continue;
		else if((ch == ' ' || ch == '\n' || ch == '\r') && !begin)  // whitespace after the word
			break;
		else  
		{
			if(isAlphabet(ch))
			{
				// The bound also covers a resumed word that already has
				// SIZE-1 characters; an equality test after the increment
				// would miss that case and write past the end of result.
				if(index < SIZE - 1)
					result[index++] = ch;
				if(begin)
					begin = false;
				if(index >= (SIZE -1))  // length limit reached: ignore the rest of the word
				{
					while(buffer[*bindex] != '\0')
					{
						ch = buffer[(*bindex)++] ;
						if(ch == ' ' || ch == '\n' || ch == '\r')
							break;
					}
					break;
				}
					
			}
			else
			{
				if(ch == ',' || ch =='.')  // punctuation ends the word
				{
					break;
				}
				else
				{       // any other character invalidates the token: skip to the next separator
					while(buffer[*bindex] != '\0')
					{
						ch = buffer[(*bindex)++]  ;
						if(ch == ' ' || ch == '\n' || ch == '\r'||ch == ','|| ch == '.')
							break;
					}
					// start over with the following token
					begin  = true;
					index = 0;
					result[0] = '\0';
					continue;
				}
				
			}
				
		}
	}
	result[index] = '\0';
	if(*bindex > 0)  // nothing precedes position 0, so there is no last character to inspect
	{
		ch = buffer[*bindex -1];
		if((buffer[*bindex] == '\0') && ch !=' '&&ch != '\n'&&ch != '\r'&&ch!=','&&ch!='.')
		{	// the chunk ended without a separator: the token may continue in the next chunk
			isHalfWord = true;
		}
	}
	
}