提取出某日访问百度次数最多的那K个IP
基本思路:ip最多2^32个,放入内存也要40G,基本不现实。所以只有用外排序,把ip分割成到不同的小文件里,然后统计次数后,汇总。另外,ip本质就是一32bit的数值,不要拘泥于字符串表示的ip。
思路也可参考此处问题一
基于以上思路,看代码:
package com.kingdee.gmis.ips; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Random; import com.kingdee.gmis.common.TopNHeap; import com.kingdee.gmis.data.FileUtils; /** * 提取出某日访问百度次数最多的那K个IP。<br> * 注意,此处简化起见,没有处理重复的情况,即有多个ip出现次数一样多 * * @author Administrator * */ public class MassIP { public static int K10 = 1024 * 10; public static int M1 = 1024 * 1024; public static int M4 = 4 * M1; private static int partationCount = 100; /** * @param args * @throws IOException */ public static void main(String[] args) throws IOException { generateMassIp("ip", "ips.txt", 100000000); generatePartitionFile("ip", "ips.txt", 100); searchTopN(10); // searchTopN2("ip", "ips.txt", 10); } /** * 如果是1亿条记录,直接放到内存排序会如何呢? * * @param srcDirName * @param srcFileName * @param count * @throws IOException */ public static void searchTopN2(String srcDirName, String srcFileName, int count) throws IOException { Map<Integer, Integer> ipCountMap = new HashMap<Integer, Integer>(); File file = FileUtils.getFile(srcDirName, srcFileName); BufferedReader br = null; try { br = new BufferedReader(new FileReader(file), M4); String s; int i = 0, j = 0; while ((s = br.readLine()) != null) { int ip = parseIp2Int(s); Integer cnt = ipCountMap.get(ip); if (cnt == null) { i++; if (i % 10000000 == 0) { System.out.println(i); } } j++; if (j % 10000000 == 0) { System.out.println(j); } ipCountMap.put(ip, cnt == null ? 1 : cnt + 1); } } finally { if (br != null) { try { br.close(); } catch (IOException e) { e.printStackTrace(); } } } TopNHeap<IPInfo> heap = new TopNHeap<IPInfo>(count); searchMostCountIps(ipCountMap, heap); printResult(heap); } /** * 查找出现次数最多的K个ip地址 * * @param count * @throws IOException */ public static void searchTopN(int count) throws IOException { File[] smallFiles = getPartitionFile("ip", partationCount); DataInputStream dis = null; Map<Integer, Integer> ipCountMap = new HashMap<Integer, Integer>(); TopNHeap<IPInfo> heap = new TopNHeap<IPInfo>(count); for (int i = 0; i < partationCount; i++) { ipCountMap.clear(); try { dis = new DataInputStream(new BufferedInputStream(new FileInputStream(smallFiles[i]), K10)); while (dis.available() > 0) { int ip = dis.readInt(); Integer cnt = ipCountMap.get(ip); ipCountMap.put(ip, cnt == null ? 1 : cnt + 1); } searchMostCountIps(ipCountMap, heap); } finally { if (dis != null) { try { dis.close(); } catch (IOException e) { e.printStackTrace(); } } } } printResult(heap); } /** * 打印结果集 * * @param heap */ private static void printResult(TopNHeap<IPInfo> heap) { while (heap.hasNext()) { System.out.println(heap.removeTop().toString()); } } /** * 查找出现次数最多的ip * * @param map * @param heap */ private static void searchMostCountIps(Map<Integer, Integer> map, TopNHeap<IPInfo> heap) { Iterator<Integer> iter = map.keySet().iterator(); Integer key = null; while (iter.hasNext()) { key = iter.next(); int count = map.get(key); if (!heap.isFull() || count > heap.getHeapTop().getCount()) { heap.addToHeap(new IPInfo(count, key)); } } } /** * 把32位int值转换成ip字符串 * * @param value * @return */ public static String parseInt2Ip(int value) { StringBuilder sb = new StringBuilder(15); String[] segs = new String[4]; for (int i = 0; i < 4; i++) { segs[3 - i] = String.valueOf((0xFF & value)); value >>= 8; } for (int i = 0; i < 4; i++) { sb.append(segs[i]).append("."); } sb.deleteCharAt(sb.length() - 1); return sb.toString(); } /** * 把ip字符串转换成int值 * * @param ip * @return */ public static int parseIp2Int(String ip) { String[] segs = ip.split("\\."); int rst = 0; for (int i = 0; i < segs.length; i++) { rst = (rst << 8) | Integer.valueOf(segs[i]); } return rst; } /** * 随机生成ip * * @param srcDirName * @param srcFileName * @param count * @throws IOException */ public static void generateMassIp(String srcDirName, String srcFileName, int count) throws IOException { File file = FileUtils.createFile(srcDirName, srcFileName); BufferedWriter bw = null; try { bw = new BufferedWriter(new FileWriter(file)); count = count / 100; Random rd = new Random(255); StringBuilder sb = new StringBuilder(1500); for (int i = 0; i < count; i++) { sb.delete(0, sb.length()); for (int j = 0; j < 100; j++) { sb.append(rd.nextInt(100)).append("."); sb.append(rd.nextInt(150)).append("."); sb.append(rd.nextInt(256)).append("."); sb.append(rd.nextInt(256)).append("\n"); } bw.write(sb.toString()); bw.flush(); } } finally { if (bw != null) { bw.close(); } } } public static File[] getPartitionFile(String srcDirName, int count) throws IOException { File[] files = new File[count]; for (int i = 0; i < count; i++) { files[i] = FileUtils.getFile(srcDirName, i + ".data"); } return files; } /** * 分割大文件到小文件中 * * @param srcDirName * @param srcFileName * @param count * @return * @throws IOException */ public static File[] generatePartitionFile(String srcDirName, String srcFileName, int count) throws IOException { File[] files = new File[count]; DataOutputStream[] dops = new DataOutputStream[count]; for (int i = 0; i < count; i++) { files[i] = FileUtils.createFile(srcDirName, i + ".data"); } File file = FileUtils.getFile(srcDirName, srcFileName); BufferedReader br = null; // Buffered try { for (int i = 0; i < count; i++) { dops[i] = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(files[i]), K10)); } br = new BufferedReader(new FileReader(file), M4); String s; while ((s = br.readLine()) != null) { int ip = parseIp2Int(s); dops[Math.abs(ip % count)].writeInt(ip); } } finally { for (int i = 0; i < count; i++) { if (dops[i] != null) { try { dops[i].close(); } catch (IOException e) { e.printStackTrace(); } } } if (br != null) { try { br.close(); } catch (IOException e) { e.printStackTrace(); } } } return files; } /** * 表示IP信息,存有次数和ip值。在放入堆中时候会用作存储 * * @author Administrator * */ static class IPInfo implements Comparable<IPInfo> { private int count; private int ipValue; public IPInfo(int count, int ipValue) { super(); this.count = count; this.ipValue = ipValue; } public int getCount() { return count; } public void setCount(int count) { this.count = count; } public int getIpValue() { return ipValue; } public void setIpValue(int ipValue) { this.ipValue = ipValue; } public String getIp() { return parseInt2Ip(ipValue); } public int compareTo(IPInfo o) { if (this.count > o.getCount()) { return 1; } else if (this.count < o.getCount()) { return -1; } else { return 0; } } public String toString() { return this.getIp() + " -- " + this.count; } } }
package com.kingdee.gmis.data; import java.io.File; import java.io.IOException; public class FileUtils { public static String directory = "data"; public static String getDirPath() { return System.getProperty("user.dir"); } public static File createFile(String dirName, String fileName) throws IOException { String dataDir = getDirPath() + File.separator + directory; String dir = dirName != null ? dataDir + File.separator + dirName : dataDir; ensureDirectory(dir); String filePath = dir + File.separator + fileName; ensureFile(filePath); return new File(filePath); } public static File getFile(String dirName, String fileName) throws IOException { String dataDir = getDirPath() + File.separator + directory; String filePath = (dirName != null ? dataDir + File.separator + dirName : dataDir) + File.separator + fileName; return new File(filePath); } public static synchronized void ensureFile(String filePath) throws IOException { File dataFile = new File(filePath); if (!dataFile.exists()) { dataFile.createNewFile(); } } public static synchronized void ensureDirectory(String dataDir) { File path = new File(dataDir); if (!path.exists()) { path.mkdirs(); } } /** * @param args */ public static void main(String[] args) { System.out.println(getDirPath()); } }
package com.kingdee.gmis.data.ips; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Random; import com.kingdee.gmis.common.TopNHeap; import com.kingdee.gmis.data.FileUtils; /** * 提取出某日访问百度次数最多的那K个IP。<br> * 注意,此处简化起见,没有处理重复的情况,即有多个ip出现次数一样多 * * @author Administrator * */ public class MassIP { public static int K10 = 1024 * 10; public static int M1 = 1024 * 1024; public static int M4 = 4 * M1; private static int partationCount = 100; /** * @param args * @throws IOException */ public static void main(String[] args) throws IOException { generateMassIp("ip", "ips.txt", 100000000); generatePartitionFile("ip", "ips.txt", 100); searchTopN(10); // searchTopN2("ip", "ips.txt", 10); } /** * 如果是1亿条记录,直接放到内存排序会如何呢? * * @param srcDirName * @param srcFileName * @param count * @throws IOException */ public static void searchTopN2(String srcDirName, String srcFileName, int count) throws IOException { Map<Integer, Integer> ipCountMap = new HashMap<Integer, Integer>(); File file = FileUtils.getFile(srcDirName, srcFileName); BufferedReader br = null; try { br = new BufferedReader(new FileReader(file), M4); String s; int i = 0, j = 0; while ((s = br.readLine()) != null) { int ip = parseIp2Int(s); Integer cnt = ipCountMap.get(ip); if (cnt == null) { i++; if (i % 10000000 == 0) { System.out.println(i); } } j++; if (j % 10000000 == 0) { System.out.println(j); } ipCountMap.put(ip, cnt == null ? 1 : cnt + 1); } } finally { if (br != null) { try { br.close(); } catch (IOException e) { e.printStackTrace(); } } } TopNHeap<IPInfo> heap = new TopNHeap<IPInfo>(count); searchMostCountIps(ipCountMap, heap); printResult(heap); } /** * 查找出现次数最多的K个ip地址 * * @param count * @throws IOException */ public static void searchTopN(int count) throws IOException { File[] smallFiles = getPartitionFile("ip", partationCount); DataInputStream dis = null; Map<Integer, Integer> ipCountMap = new HashMap<Integer, Integer>(); TopNHeap<IPInfo> heap = new TopNHeap<IPInfo>(count); for (int i = 0; i < partationCount; i++) { ipCountMap.clear(); try { dis = new DataInputStream(new BufferedInputStream(new FileInputStream(smallFiles[i]), K10)); while (dis.available() > 0) { int ip = dis.readInt(); Integer cnt = ipCountMap.get(ip); ipCountMap.put(ip, cnt == null ? 1 : cnt + 1); } searchMostCountIps(ipCountMap, heap); } finally { if (dis != null) { try { dis.close(); } catch (IOException e) { e.printStackTrace(); } } } } printResult(heap); } /** * 打印结果集 * * @param heap */ private static void printResult(TopNHeap<IPInfo> heap) { while (heap.hasNext()) { System.out.println(heap.removeTop().toString()); } } /** * 查找出现次数最多的ip * * @param map * @param heap */ private static void searchMostCountIps(Map<Integer, Integer> map, TopNHeap<IPInfo> heap) { Iterator<Integer> iter = map.keySet().iterator(); Integer key = null; while (iter.hasNext()) { key = iter.next(); int count = map.get(key); if (!heap.isFull() || count > heap.getHeapTop().getCount()) { heap.addToHeap(new IPInfo(count, key)); } } } /** * 把32位int值转换成ip字符串 * * @param value * @return */ public static String parseInt2Ip(int value) { StringBuilder sb = new StringBuilder(15); String[] segs = new String[4]; for (int i = 0; i < 4; i++) { segs[3 - i] = String.valueOf((0xFF & value)); value >>= 8; } for (int i = 0; i < 4; i++) { sb.append(segs[i]).append("."); } sb.deleteCharAt(sb.length() - 1); return sb.toString(); } /** * 把ip字符串转换成int值 * * @param ip * @return */ public static int parseIp2Int(String ip) { String[] segs = ip.split("\\."); int rst = 0; for (int i = 0; i < segs.length; i++) { rst = (rst << 8) | Integer.valueOf(segs[i]); } return rst; } /** * 随机生成ip * * @param srcDirName * @param srcFileName * @param count * @throws IOException */ public static void generateMassIp(String srcDirName, String srcFileName, int count) throws IOException { File file = FileUtils.createFile(srcDirName, srcFileName); BufferedWriter bw = null; try { bw = new BufferedWriter(new FileWriter(file)); count = count / 100; Random rd = new Random(255); StringBuilder sb = new StringBuilder(1500); for (int i = 0; i < count; i++) { sb.delete(0, sb.length()); for (int j = 0; j < 100; j++) { sb.append(rd.nextInt(100)).append("."); sb.append(rd.nextInt(150)).append("."); sb.append(rd.nextInt(256)).append("."); sb.append(rd.nextInt(256)).append("\n"); } bw.write(sb.toString()); bw.flush(); } } finally { if (bw != null) { bw.close(); } } } public static File[] getPartitionFile(String srcDirName, int count) throws IOException { File[] files = new File[count]; for (int i = 0; i < count; i++) { files[i] = FileUtils.getFile(srcDirName, i + ".data"); } return files; } /** * 分割大文件到小文件中 * * @param srcDirName * @param srcFileName * @param count * @return * @throws IOException */ public static File[] generatePartitionFile(String srcDirName, String srcFileName, int count) throws IOException { File[] files = new File[count]; DataOutputStream[] dops = new DataOutputStream[count]; for (int i = 0; i < count; i++) { files[i] = FileUtils.createFile(srcDirName, i + ".data"); } File file = FileUtils.getFile(srcDirName, srcFileName); BufferedReader br = null; // Buffered try { for (int i = 0; i < count; i++) { dops[i] = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(files[i]), K10)); } br = new BufferedReader(new FileReader(file), M4); String s; while ((s = br.readLine()) != null) { int ip = parseIp2Int(s); dops[Math.abs(ip % count)].writeInt(ip); } } finally { for (int i = 0; i < count; i++) { if (dops[i] != null) { try { dops[i].close(); } catch (IOException e) { e.printStackTrace(); } } } if (br != null) { try { br.close(); } catch (IOException e) { e.printStackTrace(); } } } return files; } /** * 表示IP信息,存有次数和ip值。在放入堆中时候会用作存储 * * @author Administrator * */ static class IPInfo implements Comparable<IPInfo> { private int count; private int ipValue; public IPInfo(int count, int ipValue) { super(); this.count = count; this.ipValue = ipValue; } public int getCount() { return count; } public void setCount(int count) { this.count = count; } public int getIpValue() { return ipValue; } public void setIpValue(int ipValue) { this.ipValue = ipValue; } public String getIp() { return parseInt2Ip(ipValue); } public int compareTo(IPInfo o) { if (this.count > o.getCount()) { return 1; } else if (this.count < o.getCount()) { return -1; } else { return 0; } } public String toString() { return this.getIp() + " -- " + this.count; } } }
package com.kingdee.gmis.common; import java.util.Random; /** * * @author hua_fan * */ public class TopNHeap<T extends Comparable<? super T>> { private int size; private int maxSize; private Object[] data = new Object[256]; public TopNHeap(T[] data) { super(); this.initHeap(data); } public TopNHeap(int fixedSize) { super(); assert fixedSize >= 1; this.maxSize = fixedSize; } @SuppressWarnings("unchecked") public void initHeap(T[] data) { assert data.length >= 1; if (this.data.length <= this.size) { this.data = new Object[(int) (data.length * 1.5)]; } this.maxSize = this.size = data.length; System.arraycopy(data, 0, this.data, 0, this.size); int startPos = this.getParentIndex(this.size - 1); for (int i = startPos; i >= 0; i--) { this.shiftdown(i); } } @SuppressWarnings("unchecked") public T getHeapTop() { return (T) this.data[0]; } /** * 加元素到堆中,但是保持堆 的大小 * * @param value */ public void addToHeap(T value) { if (this.maxSize > this.size) { this.data[this.size] = value; this.shiftup(this.size++); } else { if (value.compareTo(this.getHeapTop()) > 0) { this.data[0] = value; this.shiftdown(0); } } } private void shiftup(int pos) { int parentIdx = this.getParentIndex(pos); while (pos != 0 && this.getValue(pos).compareTo(this.getValue(parentIdx)) < 0) { this.swap(pos, parentIdx); pos = parentIdx; parentIdx = this.getParentIndex(pos); } } public T removeTop() { T rst = this.getHeapTop(); this.data[0] = this.data[--this.size]; this.shiftdown(0); return rst; } public boolean hasNext() { return this.size > 0; } @SuppressWarnings("unchecked") public T[] getData() { return (T[]) this.data; } @SuppressWarnings("unchecked") public T getValue(int index) { return (T) this.data[index]; } private int getParentIndex(int pos) { return (pos - 1) / 2; } private int getLeftChildIdx(int pos) { return pos * 2 + 1; } private int getRightChildIdx(int pos) { return pos * 2 + 2; } private void swap(int idx1, int idx2) { T tmp = this.getValue(idx1); this.data[idx1] = this.getValue(idx2); this.data[idx2] = tmp; } /** * 节点值向下级交换,构造堆 * * @param pos */ private void shiftdown(int pos) { int leftChildIdx = this.getLeftChildIdx(pos); // 没有子节点了 if (leftChildIdx >= this.size) { return; } int rightChildIdx = getRightChildIdx(pos); int toBeSwapIdx = leftChildIdx; if (rightChildIdx < this.size && this.getValue(leftChildIdx).compareTo( this.getValue(rightChildIdx)) > 0) { toBeSwapIdx = rightChildIdx; } // swap if (this.getValue(pos).compareTo(this.getValue(toBeSwapIdx)) > 0) { this.swap(pos, toBeSwapIdx); this.shiftdown(toBeSwapIdx); } } public boolean isFull(){ return this.maxSize == this.size; } public int getMaxSize() { return maxSize; } // /** * @param args */ public static void main(String[] args) { Integer[] data = { 7, 12, 13, 24, 8, 6, 4, 27, 14, 8, 12, 56, 22 }; TopNHeap<Integer> heap = new TopNHeap<Integer>(data); while (heap.hasNext()) { System.out.print(heap.removeTop()); System.out.print(" "); } System.out.println(" "); heap.initHeap(data); for (int i = 0; i < 10; i++) { heap.addToHeap(i); } while (heap.hasNext()) { System.out.print(heap.removeTop()); System.out.print(" "); } System.out.println(" "); heap = new TopNHeap<Integer>(10); Random rd = new Random(); for (int i = 0; i < 20; i++) { int value = rd.nextInt(100); // System.out.print(value); // System.out.print(" "); heap.addToHeap(value); } System.out.println(" "); while (heap.hasNext()) { System.out.print(heap.removeTop()); System.out.print(" "); } } }
package com.kingdee.gmis.common; /** * * @author hua_fan * */ public class HeapSort<T extends Comparable<? super T>> { private int size; private Object[] data = new Object[256]; public HeapSort(T[] data) { super(); this.initHeap(data); } @SuppressWarnings("unchecked") public void initHeap(T[] data) { assert data.length >= 1; ensureCapacity(); this.size = data.length; System.arraycopy(data, 0, this.data, 0, this.size); int startPos = this.getParentIndex(this.size - 1); for (int i = startPos; i >= 0; i--) { this.shiftdown(i); } } @SuppressWarnings("unchecked") public T getHeapTop() { return (T) this.data[0]; } /** * 加元素到堆中,但是保持堆 的大小 * * @param value */ public void addToHeapKeepFixedSize(T value) { if (this.getHeapTop() == null || value.compareTo(this.getHeapTop()) > 0) { this.data[0] = value; this.shiftdown(0); } } /** * 加元素到堆中,同时扩展堆的大小 * * @param value */ public void addToHeap(T value) { this.ensureCapacity(); this.data[this.size++] = value; // this.shiftdown(this.getParentIndex(this.size)); this.shiftup(this.size - 1); } private void shiftup(int pos) { int parentIdx = this.getParentIndex(pos); while (pos != 0 && this.getValue(pos).compareTo(this.getValue(parentIdx)) < 0) { this.swap(pos, parentIdx); pos = parentIdx; parentIdx = this.getParentIndex(pos); } } private void ensureCapacity() { if (this.data.length <= this.size) { Object[] oldData = this.data; this.data = new Object[(int) (data.length * 1.5)]; System.arraycopy(oldData, 0, this.data, 0, oldData.length); } } public T removeTop() { T rst = this.getHeapTop(); this.data[0] = this.data[--this.size]; this.shiftdown(0); return rst; } public boolean hasNext() { return this.size > 0; } @SuppressWarnings("unchecked") public T[] getData() { return (T[]) this.data; } @SuppressWarnings("unchecked") public T getValue(int index) { return (T) this.data[index]; } private int getParentIndex(int pos) { return (pos - 1) / 2; } private int getLeftChildIdx(int pos) { return pos * 2 + 1; } private int getRightChildIdx(int pos) { return pos * 2 + 2; } private void swap(int idx1, int idx2) { T tmp = this.getValue(idx1); this.data[idx1] = this.getValue(idx2); this.data[idx2] = tmp; } /** * 节点值向下级交换,构造堆 * * @param pos */ private void shiftdown(int pos) { int leftChildIdx = this.getLeftChildIdx(pos); // 没有子节点了 if (leftChildIdx >= this.size) { return; } int rightChildIdx = getRightChildIdx(pos); int toBeSwapIdx = leftChildIdx; if (rightChildIdx < this.size && this.getValue(leftChildIdx).compareTo(this.getValue(rightChildIdx)) > 0) { toBeSwapIdx = rightChildIdx; } // swap if (this.getValue(pos).compareTo(this.getValue(toBeSwapIdx)) > 0) { this.swap(pos, toBeSwapIdx); this.shiftdown(toBeSwapIdx); } } /** * @param args */ public static void main(String[] args) { Integer[] data = { 7, 12, 13, 24, 8, 6, 4, 27, 14, 8, 12, 56, 22 }; HeapSort<Integer> heap = new HeapSort<Integer>(data); while (heap.hasNext()) { System.out.print(heap.removeTop()); System.out.print(" "); } System.out.println(" "); heap.initHeap(data); for (int i = 0; i < 10; i++) { heap.addToHeap(i); } while (heap.hasNext()) { System.out.print(heap.removeTop()); System.out.print(" "); } System.out.println(" "); heap.initHeap(data); for (int i = 0; i < 10; i++) { heap.addToHeapKeepFixedSize(i); } while (heap.hasNext()) { System.out.print(heap.removeTop()); System.out.print(" "); } } }
以上代码用到了前边文章中提到的top K堆
此方法,内存占用情况也非常理想,高峰不到100m内存就搞定了。
扩展:
如果我们是排序1亿条ip,算下是1*4/10 = 0.4G,那么说400+m的内存也够把所有的1亿条ip的统计都放入到内存当中了?上边代码中searchTopN2是这么做了下,但是情况很不理想,读入2kw条记录的时候,内存已经占用到了快1G:
这是什么原因呢?虽然是java对象,但是分析下,HashMap的key和value都是Integer,从下图Integer的非静态变量和非静态方法看,直观分析也就value这个field占用一个字节嘛,两个Integer顶多也就是占用800+m的内存嘛,为何会1g才读入2kw整数呢?
这个问题,且等我们继续深究!
另,本题目的多线程版本后续贴出!
转http://yueyemaitian.iteye.com/blog/1180299
相关推荐
1.提取出某日访问百度次数最多的那个IP 2.有一个1G大小的一个文件,里面每一行是一个词 3.给定a、b两个文件,各存放50亿个url,每个url各占64字节,内存限制是4G,让你找出a、b文件共同的url? 4.在2.5亿个整数中找出...
在这个场景中,我们有海量日志数据,问题是如何提取出某日访问百度次数最多的那个 IP? 解决方案 1:首先是这一天,并且是访问百度的日志中的 IP 取出来,逐个写入到一个大文件中,然后采用映射的方法,找出每个小...
示例中的代码均基于python2.7.10##问题1海量日志,提取出某日访问百度次数最多的那个IP。 ##问题2寻找热门查询,300万个查询字符串中统计最热门的10个查询。 ##问题3有一个1G大小的一个文件,里面每行是一个词,词的...
第一部分、十道海量数据处理面试题 1、海量日志数据,提取出某日访问百度次数最多的那个IP。 此题,在我之前的一篇文章算法里头有所提到,当时给出的方案是:IP的数目还是有限的,最多2^32个,所以可以考虑使用hash...
大数据面试题目的解决方法可以应用于实际的数据处理中,例如,在海量日志数据中,使用Bloom filter可以快速判断某个元素是否存在于一个集合中,而使用Hashing可以快速提取出某日访问百度次数最多的那个IP。...
1. 提取出某日访问百度次数最多的IP 这是一个典型的分布式计算问题。首先,通过对IP地址取模将海量日志分散到多个小文件中,例如模1000,生成1000个小文件。然后,对每个小文件使用哈希映射(如hash_map)统计IP地址...
一、海量日志数据,提取出某日访问百度次数最多的那个 IP 这是一个典型的 hash 表应用问题,解决方案是使用 hash 表将 IP 直接存入内存,然后进行统计。在实际操作中,可以将大文件映射为多个小文件,然后找出每个...
海量日志数据,提取出某日访问百度次数最多的那个IP。 方案 1:首先是这一天,并且是访问百度的日志中的IP 取出来,逐个写入到一个大文件中。注意到 IP 是 32 位的,最多有个IP。同样可以采用映射的方法,比如模...
对于海量日志数据,例如需要提取出某日访问百度次数最多的那个IP,可以使用映射的方法,即%1000将整个大文件映射为1000个小文件,然后逐个写入到一个大文件中,然后对每个小文件中的所有IP进行频率统计,最后在这...
提取出某日访问百度次数最多的IP #### 题目描述 从海量日志数据中,找出某天访问百度次数最多的IP地址。 #### 解答思路 - 使用哈希策略将日志数据分布到多个小文件中,例如通过IP地址的哈希值对1024取模,得到...
1. 提取出某日访问百度次数最多的 IP: 这题通常需要使用流式处理,如 MapReduce 或者 Hadoop。可以先通过 Map 阶段对每个 IP 进行计数,然后在 Reduce 阶段汇总并找出最大值。 2. 找出频数最高的 100 个词: ...
4. 海量日志数据分析:有海量日志数据,要求提取出某日访问百度次数最多的那个 IP。解决方案是首先提取出这一天的日志中的 IP,然后使用映射的方法将其分割为 1000 个小文件,最后找出每个小文件中出现频率最大的 IP...
2. 海量日志数据,提取出某日访问百度次数最多的那个 IP。 3. 如何根据输入元素个数 n,确定位数组 m 的大小及哈希函数个数 k。 这些问题可以使用上述方法来解决,例如使用 Bloom filter 或 Hashing 等方法来查找、...
提取某日访问百度次数最多的那个IP** **问题描述**:给定一天内的大量日志数据,需要从中提取出访问百度次数最多的那个IP。 **解决方案**:考虑到IP地址的数量有限,最多2^32个,可以考虑使用哈希表(如`hash_map...
4. 给定海量日志数据,需要提取出某日访问百度次数最多的那个IP。解决方法是使用HashMap来统计每个IP的访问次数,然后使用堆排序来输出访问次数最多的IP。 5. 给定2.5亿个整数,需要找出不重复的整数,内存空间不...
**1、海量日志数据,提取出某日访问百度次数最多的那个IP** - **问题概述**: 给定一天内的海量日志数据,从中找出访问百度次数最多的IP地址。 - **解决方案**: - **初步思路**: 将所有IP地址写入一个大文件,考虑...
问题实例:海量日志数据,提取出某日访问百度次数最多的那个IP。IP的数目还是有限的,最多2^32个,所以可以考虑使用hash将ip直接存入内存,然后进行统计。 3.bit-map: Bit-map是一种常见的大数据处理方法,适用...