浏览 4192 次
锁定老帖子 主题:c正则的一个应用
精华帖 (0) :: 良好帖 (0) :: 新手帖 (4) :: 隐藏帖 (0)
|
|
---|---|
作者 | 正文 |
发表时间:2011-05-17
最后修改:2011-07-08
/*
 * Usage note from the original post: after compiling and before running,
 * make sure Tor is already running, otherwise no proxies can be downloaded.
 * A single run normally fetches about 2000 proxies.
 */
#include <ctype.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>
#include <netdb.h>
#include <regex.h>

#define NM 10 /* number of regmatch_t slots handed to regexec() */

static int proxy_save_count = 0;                 /* proxies written to the output file */
static int reg_num;                              /* number of entries in pregs */
static unsigned short through_proxy_port = 8118; /* port get_http_body() connects to */
static char code[10];                            /* code[d] is the letter standing for digit d */
static const char *through_proxy_ip = "127.0.0.1"; /* address get_http_body() connects to */
static regex_t **pregs;                          /* compiled patterns, see compile_regexs() */

/*
 * Print an error message on stderr and terminate the process with status i.
 *
 * message is a printf-style format containing at most one %s, which is
 * filled from str. When str is NULL the message is printed verbatim.
 */
void die(int i, const char *message, const char *str)
{
    if (str != NULL)
        fprintf(stderr, message, str);
    else
        fputs(message, stderr);
    exit(i);
}

/*
 * Write *len bytes from buf to sockfd, retrying after short writes.
 *
 * On return *len holds the number of bytes actually written.
 * Returns 0 on success (including *len == 0) and -1 if write() failed.
 */
int write_all(int sockfd, const char *buf, int *len)
{
    int total = 0;
    int status = 0;

    while (total < *len) {
        ssize_t n = write(sockfd, buf + total, (size_t)(*len - total));

        if (n == -1) {
            status = -1;
            break;
        }
        total += (int)n;
    }

    *len = total;
    return status;
}

/* Convert the string p to lower case in place. */
void to_lower(char *p)
{
    /* <ctype.h> functions require a value representable as unsigned char. */
    for (; *p != '\0'; p++)
        *p = (char)tolower((unsigned char)*p);
}

/*
 * Read the HTTP status line and headers from sockfd one byte at a time,
 * stopping after the blank line that ends the header section.
 *
 * *bytes is set to the Content-Length value when that header is present and
 * holds a valid non-negative number; otherwise it is left untouched.
 *
 * Returns 0 on success, -1 when the response is missing or its status line
 * does not contain both "HTTP/1." and "200", and -2 when read() fails.
 */
int parse_http_header(int sockfd, long *bytes)
{
    char line[1024];
    size_t used = 0;
    int lines_seen = 0;
    ssize_t n;
    char ch;

    while ((n = read(sockfd, &ch, 1)) == 1) {
        if (ch == '\r')
            continue; /* lines are terminated by '\n'; CR is just dropped */

        if (ch != '\n') {
            /* Over-long lines are truncated instead of overflowing line[]. */
            if (used < sizeof line - 1)
                line[used++] = ch;
            continue;
        }

        line[used] = '\0';

        if (lines_seen++ == 0) {
            if (strstr(line, "HTTP/1.") == NULL || strstr(line, "200") == NULL) {
                fprintf(stderr, "%s\n", line);
                return -1;
            }
        } else if (used == 0) {
            break; /* blank line: end of headers */
        } else {
            char *colon = strchr(line, ':');

            if (colon != NULL) {
                *colon = '\0';
                to_lower(line);
                if (strcmp(line, "content-length") == 0) {
                    char *end;
                    long value;

                    errno = 0;
                    value = strtol(colon + 1, &end, 10);
                    if (end != colon + 1 && errno != ERANGE && value >= 0)
                        *bytes = value;
                }
            }
        }
        used = 0;
    }

    if (n == -1) {
        fprintf(stderr, "read() error\n");
        return -2;
    }
    if (lines_seen == 0) {
        fprintf(stderr, "no HTTP response received\n");
        return -1;
    }
    return 0;
}

/*
 * Read a response body of at most `bytes` bytes from sockfd.
 *
 * On success *recv points to a NUL-terminated heap buffer owned by the
 * caller (shorter than `bytes` if the peer closed early). On failure *recv
 * is set to NULL. sockfd is closed on every path.
 *
 * Returns 0 on success and 1 when read() fails. Terminates the process via
 * die() if memory cannot be allocated.
 */
int with_content_length(int sockfd, long bytes, char **recv)
{
    size_t want = bytes > 0 ? (size_t)bytes : 0;
    size_t got = 0;
    ssize_t n = 0;
    char *body = malloc(want + 1);

    if (body == NULL)
        die(-1, "Memory allocation failed!\n", NULL);

    /* Never read past the announced length: the buffer is exactly that big. */
    while (got < want && (n = read(sockfd, body + got, want - got)) > 0)
        got += (size_t)n;
    body[got] = '\0';

    close(sockfd);

    if (n == -1) {
        fprintf(stderr, "read() error\n");
        free(body);
        *recv = NULL;
        return 1;
    }

    *recv = body;
    return 0;
}

/*
 * Read a response body of unknown length from sockfd until end of stream.
 *
 * On success *recv points to a NUL-terminated heap buffer owned by the
 * caller; it is an empty string when the peer sent nothing. On failure
 * *recv is set to NULL. sockfd is closed on every path.
 *
 * Returns 0 on success and 1 when read() fails. Terminates the process via
 * die() if memory cannot be allocated.
 */
int without_content_length(int sockfd, char **recv)
{
    size_t cap = 4096;
    size_t len = 0;
    ssize_t n;
    char *body = malloc(cap);

    if (body == NULL)
        die(-1, "Memory allocation failed!\n", NULL);

    for (;;) {
        /* Keep room for at least one data byte plus the terminator. */
        if (cap - len < 2) {
            char *grown = cap > SIZE_MAX / 2 ? NULL : realloc(body, cap * 2);

            if (grown == NULL)
                die(-1, "Memory reallocation failed!\n", NULL);
            body = grown;
            cap *= 2;
        }

        n = read(sockfd, body + len, cap - len - 1);
        if (n <= 0)
            break;
        len += (size_t)n;
    }
    body[len] = '\0';

    close(sockfd);

    if (n == -1) {
        fprintf(stderr, "read() error\n");
        free(body);
        *recv = NULL;
        return 1;
    }

    *recv = body;
    return 0;
}

/*
 * Send the request text `send` to through_proxy_ip:through_proxy_port and
 * return the response body in *recv (heap buffer owned by the caller, NULL
 * on failure).
 *
 * Returns 0 on success, or 1..6 identifying the failing step: socket(),
 * connect(), write, header parsing, body with Content-Length, body without
 * Content-Length. The socket is closed on every path.
 */
int get_http_body(char *send, char **recv)
{
    int sockfd;
    int len = (int)strlen(send);
    struct sockaddr_in sa;
    long bytes = 0;

    *recv = NULL;

    if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
        fprintf(stderr, "socket() failed\n");
        return 1;
    }

    memset(&sa, 0, sizeof sa);
    sa.sin_family = AF_INET;
    sa.sin_addr.s_addr = inet_addr(through_proxy_ip);
    sa.sin_port = htons(through_proxy_port);

    if (connect(sockfd, (struct sockaddr *)&sa, sizeof sa) == -1) {
        fprintf(stderr, "connect() failed\n");
        close(sockfd);
        return 2;
    }

    if (write_all(sockfd, send, &len) == -1) {
        fprintf(stderr, "write() error\n");
        close(sockfd);
        return 3;
    }

    if (parse_http_header(sockfd, &bytes) < 0) {
        close(sockfd);
        return 4;
    }

    /* Both body readers close sockfd themselves, on success and on failure. */
    if (bytes > 0) {
        if (with_content_length(sockfd, bytes, recv))
            return 5;
    } else {
        if (without_content_length(sockfd, recv))
            return 6;
    }
    return 0;
}

/*
 * Copy src[start..end) into a newly allocated NUL-terminated string stored
 * in *dst (owned by the caller). A negative start or an empty/inverted range
 * yields an empty string.
 *
 * Always returns 0. Terminates the process via die() if allocation fails.
 */
int sub_string(int start, int end, const char *src, char **dst)
{
    size_t n = (start >= 0 && end > start) ? (size_t)(end - start) : 0;
    char *out = malloc(n + 1);

    if (out == NULL)
        die(-1, "Memory allocation failed!\n", NULL);

    if (n > 0)
        memcpy(out, src + start, n);
    out[n] = '\0';

    *dst = out;
    return 0;
}

/*
 * Look up the first character of p in the code[] table.
 *
 * Returns the matching index 0..9, or 10 when the character is not in the
 * table (or p is empty).
 */
int get_index(const char *p)
{
    int i;

    if (*p == '\0')
        return (int)sizeof code; /* do not match unset (zero) table slots */

    for (i = 0; i < (int)sizeof code; i++) {
        if (code[i] == *p)
            break;
    }
    return i;
}

/*
 * Rebuild the code[] table from a string of 4-character entries of the form
 * "<letter>=<digit>;" (at most 10 entries are read): code[digit] = letter.
 * Malformed entries are skipped.
 */
void get_code(const char *p)
{
    size_t len = strlen(p);
    size_t i;

    memset(code, 0, sizeof code);

    for (i = 0; i < 4 * sizeof code && i + 2 < len; i += 4) {
        unsigned char digit = (unsigned char)p[i + 2];

        if (p[i + 1] == '=' && isdigit(digit))
            code[digit - '0'] = p[i];
    }
}

/*
 * Decode a '+'-separated port string such as "+a+b+c" through code[] and
 * write the resulting digits followed by a newline to fp. Only the first
 * character of each '+'-separated token is decoded; characters missing from
 * the table are reported on stderr.
 */
void print_port(const char *port, FILE *fp)
{
    const char *p = port;

    while (*p != '\0') {
        int digit;

        if (*p == '+') {
            p++;
            continue;
        }

        digit = get_index(p);
        if (digit < (int)sizeof code)
            fprintf(fp, "%d", digit);
        else
            fprintf(stderr, "Cannot decode port\n");

        /* Skip the rest of this token. */
        while (*p != '\0' && *p != '+')
            p++;
    }
    fprintf(fp, "\n");
}

/*
 * Scan str with the compiled pattern pregs[k] and act on the matches.
 *
 *   i    - index of the sub-match holding the address (or the letter table)
 *   j    - index of the sub-match after which the next search starts
 *   n    - index of the sub-match holding the port; any value outside
 *          0..NM-1 means the pattern has no separate port group
 *   flag - without a port group: non-zero means "load the first match into
 *          code[] via get_code() and stop", zero means "write each match to
 *          fp on its own line";
 *          with a port group: non-zero means the port is letter-encoded and
 *          is decoded with print_port(), zero means it is written as-is
 *
 * Every line written to fp increments proxy_save_count. str is not modified.
 *
 * Returns 0 on success and -1 when an argument is out of range.
 */
int parse_http_body(char *str, FILE *fp, int eflags, int i, int j, int n,
                    int k, int flag)
{
    regmatch_t pm[NM];
    const int has_port = (n >= 0 && n < NM);

    if (str == NULL || fp == NULL || pregs == NULL || k < 0 || k >= reg_num ||
        i < 0 || i >= NM || j < 0 || j >= NM)
        return -1;

    if (!has_port && flag) {
        /* Only the first match is needed to load the letter table. */
        if (*str != '\0' && regexec(pregs[k], str, NM, pm, eflags) == 0) {
            char *table = NULL;

            sub_string(pm[i].rm_so, pm[i].rm_eo, str, &table);
            get_code(table);
            free(table);
        }
        return 0;
    }

    while (*str != '\0' && regexec(pregs[k], str, NM, pm, eflags) == 0) {
        char *ip = NULL;

        sub_string(pm[i].rm_so, pm[i].rm_eo, str, &ip);

        if (!has_port) {
            fprintf(fp, "%s\n", ip);
        } else {
            char *port = NULL;

            sub_string(pm[n].rm_so, pm[n].rm_eo, str, &port);
            if (flag) {
                fprintf(fp, "%s:", ip);
                print_port(port, fp);
            } else {
                fprintf(fp, "%s:%s\n", ip, port);
            }
            free(port);
        }

        proxy_save_count++;
        free(ip);

        /* An unmatched or empty group j would stall or move backwards. */
        if (pm[j].rm_eo <= 0)
            break;
        str += pm[j].rm_eo;
    }
    return 0;
}

/*
 * Download url and append every proxy found in the page to fp.
 *
 * keys[0] and keys[1] are indexes into pregs. When keys[1] is negative the
 * page is scanned once with keys[0] and whole matches are written as-is.
 * Otherwise keys[0] locates the letter table and keys[1] locates
 * address / encoded-port pairs.
 *
 * Returns 0 on success, 1 when the request cannot be built from url,
 * 2 when the download fails or keys[0] is out of range (single-pattern
 * mode), 3 when a key is out of range (two-pattern mode).
 */
int print_proxys(char *url, int *keys, FILE *fp, int eflags)
{
    char host[256];
    char send[1024];
    char *data = NULL;
    int req_len;

    /* Validate the keys first so that nothing is fetched (or leaked) in vain. */
    if (keys[1] < 0) {
        if (keys[0] > reg_num - 1 || keys[0] < 0)
            return 2;
    } else {
        if (keys[0] > reg_num - 1 || keys[0] < 0 ||
            keys[1] > reg_num - 1 || keys[1] < 0)
            return 3;
    }

    if (sscanf(url, "http://%255[^/]", host) != 1) {
        fprintf(stderr, "Cannot parse url\n");
        return 1;
    }

    /*
     * NOTE(review): the request asks for "deflate" encoding and a kept-alive
     * connection, while the body readers expect plain text and, without a
     * Content-Length, read until the peer closes. Confirm this is what the
     * upstream proxy and site actually deliver.
     */
    req_len = snprintf(send, sizeof send,
                       "GET %s HTTP/1.1\r\n"
                       "Host: %s\r\n"
                       "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13\r\n"
                       "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
                       "Accept-Language: en-us,en;q=0.5\r\n"
                       "Accept-Encoding: deflate\r\n"
                       "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\n"
                       "Keep-Alive: 115\r\n"
                       "Proxy-Connection: keep-alive\r\n"
                       "\r\n",
                       url, host);
    if (req_len < 0 || (size_t)req_len >= sizeof send) {
        fprintf(stderr, "Request for %s does not fit the send buffer\n", url);
        return 1;
    }

    if (get_http_body(send, &data) != 0)
        return 2;

    if (keys[1] < 0) {
        parse_http_body(data, fp, eflags, 0, 2, 11, keys[0], 0);
    } else {
        /* Do not decode this page with the table of a previous page. */
        memset(code, 0, sizeof code);
        parse_http_body(data, fp, eflags, 0, 1, 11, keys[0], 1);
        parse_http_body(data, fp, eflags, 1, 3, 3, keys[1], 1);
    }

    free(data);
    return 0;
}

/*
 * Compile the reg_num patterns in regexs into the global pregs array.
 *
 * Returns 0 on success, or the 1-based index of the first pattern that
 * fails to compile; in that case everything allocated so far is released
 * and pregs is reset to NULL. Terminates the process via die() if memory
 * cannot be allocated.
 */
int compile_regexs(char **regexs, int cflags)
{
    int i;

    pregs = NULL;
    if (reg_num <= 0)
        return 0;

    if ((pregs = calloc((size_t)reg_num, sizeof *pregs)) == NULL)
        die(-1, "Memory allocation failed!\n", NULL);

    for (i = 0; i < reg_num; i++) {
        int rc;

        if ((pregs[i] = malloc(sizeof *pregs[i])) == NULL)
            die(-1, "Memory allocation failed!\n", NULL);

        rc = regcomp(pregs[i], regexs[i], cflags);
        if (rc != 0) {
            char why[128];
            int done;

            regerror(rc, pregs[i], why, sizeof why);
            fprintf(stderr, "regcomp() failed: %s\n", why);

            /* pregs[i] holds no compiled pattern; only earlier ones do. */
            free(pregs[i]);
            for (done = 0; done < i; done++) {
                regfree(pregs[done]);
                free(pregs[done]);
            }
            free(pregs);
            pregs = NULL;
            return i + 1;
        }
    }
    return 0;
}

/* Release every compiled pattern and the pregs array itself. */
void free_regs(void)
{
    int i;

    if (pregs == NULL)
        return;

    for (i = 0; i < reg_num; i++) {
        regfree(pregs[i]);
        free(pregs[i]);
    }
    free(pregs);
    pregs = NULL;
}

/*
 * Download every listed page and save the proxies found to proxy.txt.
 * The first ten pages are parsed with the letter-table patterns (1 and 2),
 * the remaining ones with the plain address:port pattern (0).
 */
int main(void)
{
    char *file = "proxy.txt";
    FILE *fp = NULL;
    int num, i, keys[2];
    int eflags = 0, cflags = REG_EXTENDED | REG_NEWLINE;
    int status = 0;

    char *regexs[] = {
        "([0-9]{1,3}\\.){3}([0-9]{1,3}:[0-9]{1,5})",
        "([a-z]=[0-9];){10}",
        "(([0-9]{1,3}\\.){3}[0-9]{1,3})[^\\+]+((\\+[a-z]){1,5})",
    };

    char *urls[] = {
        "http://www.samair.ru/proxy/type-01.htm",
        "http://www.samair.ru/proxy/type-02.htm",
        "http://www.samair.ru/proxy/type-03.htm",
        "http://www.samair.ru/proxy/type-04.htm",
        "http://www.samair.ru/proxy/type-05.htm",
        "http://www.samair.ru/proxy/type-06.htm",
        "http://www.samair.ru/proxy/type-07.htm",
        "http://www.samair.ru/proxy/type-08.htm",
        "http://www.samair.ru/proxy/type-09.htm",
        "http://www.samair.ru/proxy/type-10.htm",
        "http://www.samair.ru/proxy/type-11.htm",
        "http://www.samair.ru/proxy/type-12.htm",
        "http://www.samair.ru/proxy/type-13.htm",
        "http://www.samair.ru/proxy/type-14.htm",
        "http://www.samair.ru/proxy/type-15.htm",
        "http://www.samair.ru/proxy/type-16.htm",
        "http://www.samair.ru/proxy/type-17.htm",
        "http://www.samair.ru/proxy/type-18.htm",
        "http://www.samair.ru/proxy/type-19.htm",
        "http://www.samair.ru/proxy/type-20.htm",
    };

    if ((fp = fopen(file, "w")) == NULL)
        die(-1, "Failed to create the file %s\n", file);

    reg_num = sizeof(regexs) / sizeof(regexs[0]);
    if (compile_regexs(regexs, cflags) != 0) {
        fclose(fp);
        return 1;
    }

    num = sizeof(urls) / sizeof(urls[0]);
    printf("Proxy downloading is processing...\n");

    for (i = 0; i < num; i++) {
        if (i < 10) {
            keys[0] = 1;
            keys[1] = 2;
        } else {
            keys[0] = 0;
            keys[1] = -1;
        }
        print_proxys(urls[i], keys, fp, eflags);
    }

    if (proxy_save_count > 0)
        printf("Now we have downloaded %d proxys, saved in the file %s\n",
               proxy_save_count, file);
    else
        printf("Oops, no proxy saved!\n");

    /* fclose() flushes buffered output, so a failure here means lost data. */
    if (fclose(fp) != 0) {
        fprintf(stderr, "Failed to write the file %s\n", file);
        status = 1;
    }

    free_regs();
    return status;
}

/*
 * Source: http://www.innohot.com/?p=10
 * Notice: the copyright of ITeye articles belongs to their authors and is
 * protected by law. Reproduction without the author's written permission is
 * not allowed.
 */
推荐链接
|
|
返回顶楼 | |
发表时间:2011-06-22
写得不错~
|
|
返回顶楼 | |
发表时间:2011-06-29
楼上,多来我的BLOG玩玩吧
|
|
返回顶楼 | |
发表时间:2011-08-06
程序写得不错,给个建议,希望楼主不要介意
在函数print_proxys中,即代码294行: sprintf(send, "GET %s HTTP/1.1\r\nHost: %s\r\nUser-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: deflate\r\nAccept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\nKeep-Alive: 115\r\nProxy-Connection: keep-alive\r\n\r\n", url, host); 这个格式化字符串还是挺乱的。另外,看你的 send 数组定义是 char send[512],如果塞入的内容过多,是不是也会引起 send 数组越界?sprintf()函数虽然是个利器,但是它跟char* gets(char *s)函数一样不检查目标缓冲区的长度,存在溢出隐患。 如果要用,一般都要保证分配的数组能容纳可能的最大长度。 呵呵,发表下意见,不知道正确不正确 |
|
返回顶楼 | |
发表时间:2011-10-14
你说的没错,严格来讲是应该用字符指针动态分配内存。
好久没搞C了。 |
|
返回顶楼 | |