本篇内容介绍了“分析数据库实现原理”的有关知识,在实际案例的操作过程中,不少人都会遇到这样的困境,接下来就让小编带领大家学习一下如何处理这些情况吧!希望大家仔细阅读,能够学有所成!
Hash连接:如内存足够,首先遍历内表创建Hash表;然后遍历外表,对每一行的连接键计算HashCode,定位到Hash表中对应的桶,再遍历该桶中具有同一HashCode的链表,如连接键的值一致,则返回该数据。
如内存不够,可分别遍历两张表,使用同样的Hash函数把每张表拆分为N个Hash“分区”,然后依次遍历内表的每一个Hash分区和外表中编号相同的Hash分区,如找到连接键值一致的数据,则返回该数据。
详见代码注释.
#include <stdio.h> #include <stdlib.h> #include "hash_join.h" #define MAX_ELEMENTS 1024 //生成hash code static int generate_hashcode(int n) { return n % HASH_BUCKET; } //生成hash桶(写入到文件中,以文件的方式模拟) static int generate_bucket(FILE *file,char *tag) { printf("----------- generate_bucket ---------- \\n"); //数组 char buf[MAX_BYTES]; FILE *fd = NULL; for(;!feof(file);) { int x = read_int(file,buf); if(x == 0) break; int hashcode = generate_hashcode(x); char filename[30]; sprintf(filename,"/cygdrive/d/tmp/hash/%s_%d.csv",tag,hashcode); //printf("Hash code is %d,Bucket filename is %s.\\n",hashcode,filename); fd = fopen(filename,"a"); if(fd == NULL) { printf("Can not open file %s.\\n",filename); return 0; } //写入文件中 write_int(fd,x); fclose(fd); } return 1; } //把hash表加载到内存中,适用于内存足够的情况 //使用二维数组模拟Hash表,D1 : hash桶,D2 : 桶中的数据 static int load_hashtable(int ht[][MAX_ELEMENTS]) { printf("----------- load_hashtable ---------- \\n"); for(int i=0;i < HASH_BUCKET;i++) { //循环桶号 char filename[MAX_BYTES]; //读文件 sprintf(filename,"/cygdrive/d/tmp/hash/inner_%d.csv",i); FILE *fd = fopen(filename,"r"); if(fd == NULL){ //printf("Can not open file : %s\\n",filename); continue; } int j=0; char buf[MAX_BYTES]; for(;!feof(fd) && j < MAX_ELEMENTS;) { //把文件内容放到数组中 int x = read_int(fd,buf); ht[i][j++] = x; } fclose(fd); } return 1; } //使用内存创建hash表进行hash连接 static void hash_join_onmemory(FILE *outerfile,FILE *innerfile) { printf("----------- hash_join_onmemory ---------- \\n"); int ht[HASH_BUCKET][MAX_ELEMENTS]; char buffer[MAX_BYTES]; int flag = 0; //创建hash bucket文件 flag = generate_bucket(innerfile,"inner"); if(!flag) { printf("Can not generate bucket file!\\n"); return; } //加载到hash表中(二维数组模拟) flag = load_hashtable(ht); if(!flag) { printf("Can not load hash table!\\n"); return; } //遍历第二个文件,执行JOIN for(;!feof(outerfile);) { //读第二个文件,执行join int outer = read_int(outerfile,buffer); //计算hashcode int hashcode = generate_hashcode(outer); for(int i=0;i < MAX_ELEMENTS;i++) { //遍历hash桶中的数据,找到对应的数据 if(ht[hashcode][i] == 
outer) { printf("Found one,hash bucket is %d,value is : %d.\\n",hashcode,outer); } } } } //使用磁盘缓存进行hash连接 static void hash_join_ondisk(FILE *outerfile,FILE *innerfile) { printf("----------- hash_join_ondisk ---------- \\n"); char buffer[MAX_BYTES]; int flag = 0; //创建hash"桶"文件 flag = generate_bucket(innerfile,"inner"); if(!flag) { printf("Can not generate inner bucket file!\\n"); return; } flag = generate_bucket(outerfile,"outer"); if(!flag) { printf("Can not generate outer bucket file!\\n"); return; } //遍历hash值相同的文件,执行连接 for(int i=0;i < HASH_BUCKET;i++) { //从0号桶开始 char innerfname[MAX_BYTES]; char outerfname[MAX_BYTES]; //读文件 sprintf(innerfname,"/cygdrive/d/tmp/hash/%s_%d.csv","inner",i); sprintf(outerfname,"/cygdrive/d/tmp/hash/%s_%d.csv","outer",i); FILE *fd_inner = fopen(innerfname,"r"); if(fd_inner == NULL){ //printf("Can not open file : %s\\n",filename); continue; } FILE *fd_outer = fopen(outerfname,"r"); if(fd_outer == NULL) { continue; } for(;!feof(fd_outer);) { int v_out = read_int(fd_outer,buffer); if(v_out == 0) continue; for(;!feof(fd_inner);) { int v_in = read_int(fd_inner,buffer); if(v_in == 0) continue; if(v_out == v_in) { printf("Found one,hash bucket is %d,value is : %d.\\n",i,v_out); } } rewind(fd_inner); } } } //执行Hash连接 void hash_join(char *file1,char * file2,char *flag) { printf("----------- hash join ---------- \\n"); FILE *outerfile = fopen(file1,"r"); if(outerfile == NULL) { printf("Can not open file %s.\\n",file1); return; } //打开第二个文件 FILE *innerfile = fopen(file2,"r"); if(innerfile == NULL) { printf("Can not open file %s.\\n",file2); return; } //执行JOIN if(strcmp(flag,"memory") == 0) hash_join_onmemory(outerfile,innerfile); else hash_join_ondisk(outerfile,innerfile); //关闭 fclose(outerfile); fclose(innerfile); }
运行输出
$ cat file1.csv 1 2 3 4 5 1 234 2939 9002 20 $ cat file2.csv 11 20 3 40 55 50 234 33 90 1 $ /cygdrive/d/tmp/test.exe file1.csv file2.csv ------------- use memory ------------------ ----------- hash join ---------- ----------- hash_join_onmemory ---------- ----------- generate_bucket ---------- ----------- load_hashtable ---------- Found one,hash bucket is 1,value is : 1. Found one,hash bucket is 3,value is : 3. Found one,hash bucket is 1,value is : 1. Found one,hash bucket is 106,value is : 234. Found one,hash bucket is 20,value is : 20. ------------- use disk ------------------ ----------- hash join ---------- ----------- hash_join_ondisk ---------- ----------- generate_bucket ---------- ----------- generate_bucket ---------- Found one,hash bucket is 1,value is : 1. Found one,hash bucket is 1,value is : 1. Found one,hash bucket is 3,value is : 3. Found one,hash bucket is 20,value is : 20. Found one,hash bucket is 106,value is : 234.
“分析数据库实现原理”的内容就介绍到这里了,感谢大家的阅读。如果想了解更多行业相关的知识可以关注云搜网网站,小编将为大家输出更多高质量的实用文章!