for (i = 0; i < depth; i++) { // generate new segment //根据传入参数depth来决定循环次数,生成segment Path[] segs = generator.generate(crawlDb, segments, -1, topN, System .currentTimeMillis()); if (segs == null) { LOG.info("Stopping at depth=" + i + " - no more URLs to fetch."); break; } fetcher.fetch(segs[0], threads); // fetch it if (!Fetcher.isParsing(job)) { parseSegment.parse(segs[0]); // parse it, if needed } crawlDbTool.update(crawlDb, segs, true, true); // update crawldb }
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter, boolean norm, boolean force, int maxNumSegments) throws IOException { //生成临时存储路径 Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis()); //生成文件锁 Path lock = new Path(dbDir, CrawlDb.LOCK_NAME); FileSystem fs = FileSystem.get(getConf()); LockUtil.createLockFile(fs, lock, force); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); LOG.info("Generator: starting at " + sdf.format(start)); LOG.info("Generator: Selecting best-scoring urls due for fetch."); LOG.info("Generator: filtering: " + filter); LOG.info("Generator: normalizing: " + norm); if (topN != Long.MAX_VALUE) { LOG.info("Generator: topN: " + topN); } if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))){ LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead"); } // map to inverted subset due for fetch, sort by score JobConf job = new NutchJob(getConf()); job.setJobName("generate: select from " + dbDir); //用户如果没有指定的话,就默认为map的数量 if (numLists == -1) { // for politeness make numLists = job.getNumMapTasks(); // a partition per fetch task } 如果mapreduce设置为local,就只用一个mapper if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) { // override LOG.info("Generator: jobtracker is 'local', generating exactly one partition."); numLists = 1; } 设置生成时间 job.setLong(GENERATOR_CUR_TIME, curTime); // record real generation time long generateTime = System.currentTimeMillis(); job.setLong(Nutch.GENERATE_TIME_KEY, generateTime); job.setLong(GENERATOR_TOP_N, topN); job.setBoolean(GENERATOR_FILTER, filter); job.setBoolean(GENERATOR_NORMALISE, norm); job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments); 配置作业信息 FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME)); job.setInputFormat(SequenceFileInputFormat.class); 
job.setMapperClass(Selector.class); job.setPartitionerClass(Selector.class); job.setReducerClass(Selector.class); FileOutputFormat.setOutputPath(job, tempDir); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(FloatWritable.class); job.setOutputKeyComparatorClass(DecreasingFloatComparator.class); job.setOutputValueClass(SelectorEntry.class); job.setOutputFormat(GeneratorOutputFormat.class); try { JobClient.runJob(job); } catch (IOException e) { throw e; } ................... ................... ................... }
/** Select & invert subset due for fetch. */ public void map(Text key, CrawlDatum value, OutputCollector<FloatWritable,SelectorEntry> output, Reporter reporter) throws IOException { Text url = key; //如果有filter设置,先对url进行过滤 if (filter) { // If filtering is on don't generate URLs that don't pass // URLFilters try { if (filters.filter(url.toString()) == null) return; } catch (URLFilterException e) { if (LOG.isWarnEnabled()) { LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")"); } } } CrawlDatum crawlDatum = value; // check fetch schedule //检查抓取时间,没有达到抓取时间就过滤掉 if (!schedule.shouldFetch(url, crawlDatum, curTime)) { LOG.debug("-shouldFetch rejected '" + url + "', fetchTime=" + crawlDatum.getFetchTime() + ", curTime=" + curTime); return; } LongWritable oldGenTime = (LongWritable) crawlDatum.getMetaData().get( Nutch.WRITABLE_GENERATE_TIME_KEY); if (oldGenTime != null) { // awaiting fetch & update if (oldGenTime.get() + genDelay > curTime) // still wait for // update return; } //计算得分 float sort = 1.0f; try { sort = scfilters.generatorSortValue((Text) key, crawlDatum, sort); } catch (ScoringFilterException sfe) { if (LOG.isWarnEnabled()) { LOG.warn("Couldn't filter generatorSortValue for " + key + ": " + sfe); } } if (restrictStatus != null && !restrictStatus.equalsIgnoreCase(CrawlDatum.getStatusName(crawlDatum.getStatus()))) return; // consider only entries with a score superior to the threshold //如果分值小于阀值,过滤掉 if (scoreThreshold != Float.NaN && sort < scoreThreshold) return; // consider only entries with a retry (or fetch) interval lower than threshold if (intervalThreshold != -1 && crawlDatum.getFetchInterval() > intervalThreshold) return; // sort by decreasing score, using DecreasingFloatComparator sortValue.set(sort); // record generation time //记录生成时间 crawlDatum.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, genTime); entry.datum = crawlDatum; entry.url = (Text) key; output.collect(sortValue, entry); // invert for sort by score }
/** Partition by host / domain or IP. */ //根据domain或ip 来分配给reduce public int getPartition(FloatWritable key, Writable value, int numReduceTasks) { return partitioner.getPartition(((SelectorEntry) value).url, key, numReduceTasks); }
public void configure(JobConf job) { curTime = job.getLong(GENERATOR_CUR_TIME, System.currentTimeMillis()); limit = job.getLong(GENERATOR_TOP_N, Long.MAX_VALUE) / job.getNumReduceTasks(); maxCount = job.getInt(GENERATOR_MAX_COUNT, -1); // back compatibility with old param int oldMaxPerHost = job.getInt(GENERATE_MAX_PER_HOST, -1); if (maxCount==-1 && oldMaxPerHost!=-1){ maxCount = oldMaxPerHost; byDomain = false; } if (GENERATOR_COUNT_VALUE_DOMAIN.equals(job.get(GENERATOR_COUNT_MODE))) byDomain = true; filters = new URLFilters(job); normalise = job.getBoolean(GENERATOR_NORMALISE, true); if (normalise) normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_GENERATE_HOST_COUNT); scfilters = new ScoringFilters(job); partitioner.configure(job); filter = job.getBoolean(GENERATOR_FILTER, true); genDelay = job.getLong(GENERATOR_DELAY, 7L) * 3600L * 24L * 1000L; long time = job.getLong(Nutch.GENERATE_TIME_KEY, 0L); if (time > 0) genTime.set(time); schedule = FetchScheduleFactory.getFetchSchedule(job); scoreThreshold = job.getFloat(GENERATOR_MIN_SCORE, Float.NaN); intervalThreshold = job.getInt(GENERATOR_MIN_INTERVAL, -1); restrictStatus = job.get(GENERATOR_RESTRICT_STATUS, null); maxNumSegments = job.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1); segCounts = new int[maxNumSegments]; }
/** Collect until limit is reached. */ public void reduce(FloatWritable key, Iterator<SelectorEntry> values, OutputCollector<FloatWritable,SelectorEntry> output, Reporter reporter) throws IOException { while (values.hasNext()) { if (count == limit) { // do we have any segments left? if (currentsegmentnum < maxNumSegments) { count = 0; currentsegmentnum++; } else break; } SelectorEntry entry = values.next(); Text url = entry.url; String urlString = url.toString(); URL u = null; String hostordomain = null; try { if (normalise && normalizers != null) { urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_GENERATE_HOST_COUNT); } u = new URL(urlString); if (byDomain) { hostordomain = URLUtil.getDomainName(u); } else { hostordomain = new URL(urlString).getHost(); } } catch (Exception e) { LOG.warn("Malformed URL: '" + urlString + "', skipping (" + StringUtils.stringifyException(e) + ")"); reporter.getCounter("Generator", "MALFORMED_URL").increment(1); continue; } hostordomain = hostordomain.toLowerCase(); // only filter if we are counting hosts or domains if (maxCount > 0) { int[] hostCount = hostCounts.get(hostordomain); if (hostCount == null) { hostCount = new int[] {1, 0}; hostCounts.put(hostordomain, hostCount); } // increment hostCount hostCount[1]++; // check if topN reached, select next segment if it is while (segCounts[hostCount[0]-1] >= limit && hostCount[0] < maxNumSegments) { hostCount[0]++; hostCount[1] = 0; } // reached the limit of allowed URLs per host / domain // see if we can put it in the next segment? if (hostCount[1] >= maxCount) { if (hostCount[0] < maxNumSegments) { hostCount[0]++; hostCount[1] = 0; } else { if (hostCount[1] == maxCount + 1 && LOG.isInfoEnabled()) { LOG.info("Host or domain " + hostordomain + " has more than " + maxCount + " URLs for all " + maxNumSegments + " segments. 
Additional URLs won't be included in the fetchlist."); } // skip this entry continue; } } entry.segnum = new IntWritable(hostCount[0]); segCounts[hostCount[0]-1]++; } else { entry.segnum = new IntWritable(currentsegmentnum); segCounts[currentsegmentnum-1]++; } output.collect(key, entry); // Count is incremented only when we keep the URL // maxCount may cause us to skip it. count++; } } }
可以通过上述代码看出,Generator的第一个Job,实现的逻辑如下:
1.根据条件过滤掉不满足要求的url
2.根据配置生成相应数量的segments
3.计算出每个url所属的segments
相关推荐
### Nutch 1.2 源码阅读深入解析 #### Crawl类核心作用与流程概览 在深入了解Nutch 1.2源码之前,我们先明确Nutch的架构和工作流程。Nutch作为一款开源搜索引擎框架,其功能涵盖网页抓取、索引构建以及查询处理。...
6. **国际化支持**:Nutch 支持多种语言的网页抓取和处理,包括中文,通过插件机制可以添加新的语言处理模块。 7. **插件架构**:Nutch 的灵活性很大程度上得益于其插件体系,开发者可以编写自定义插件来扩展其功能...
Nutch 是一个高度可扩展且开放源码的网络爬虫项目,主要用于抓取和索引互联网上的数据。本篇将基于提供的文件内容对 Nutch 的参数设置进行深入解析,帮助读者更好地理解 Nutch 中各个组件的工作原理及配置方式。 ##...
3. **Fetcher**:`Fetcher` 根据 `Generator` 生成的抓取列表下载网页,`Fetcher` 命令可设置线程数来控制并发抓取,下载后的网页源码存放在 `content` 文件夹,状态信息存放在 `crawl_fetch` 文件夹。 4. **Parse*...
在深入理解 Nutch 的工作原理之前,了解其源码是至关重要的。本文将解析 Nutch-0.9 版本中的 `Crawl` 类,它是 Nutch 抓取流程的起点。 `Crawl` 类位于 `org.apache.nutch.crawl` 包中,它包含了启动 Nutch 抓取...