本程序可以完成的工作:转移csdn上面的文章(限于文本内容)到wordpress;不能完成的工作:1、不支持在wordpress上创建分类,所以需要提前在wordpress上手工创建分类(保持与csdn一致);2、不能以很好的格式转移文章,转移之后文章格式需要调整。

程序由采集、解析、发帖三部分构成。采集负责将指定url的内容下载下来,解析负责从网页内容中解析出正文链接、标题、发布时间、分类信息,发帖部分负责将解析出来的数据通过rpc发送给wordpress,生成博文。

本程序用到的jar包及其版本如下:

-rw-r--r-- 1 mingyuan mingyuan  46725 2011-09-03 23:05 commons-codec-1.3.jar
-rw-r--r-- 1 mingyuan mingyuan 279781 2011-09-03 23:05 commons-httpclient-3.0.1.jar
-rwxrwxrwx 1 mingyuan mingyuan  52915 2010-05-03 03:39 commons-logging-1.1.jar
-rw-r--r-- 1 mingyuan mingyuan 281579 2011-09-04 01:40 jsoup-1.6.1.jar
-rwxrwxrwx 1 mingyuan mingyuan  34407 2010-05-03 03:39 ws-commons-util-1.0.2.jar
-rwxrwxrwx 1 mingyuan mingyuan  58573 2010-05-03 03:39 xmlrpc-client-3.1.3.jar
-rwxrwxrwx 1 mingyuan mingyuan 109131 2010-05-03 03:39 xmlrpc-common-3.1.3.jar
-rwxrwxrwx 1 mingyuan mingyuan  81555 2010-05-03 03:39 xmlrpc-server-3.1.3.jar

代码很简单,就不解释了,大伙看看即可明白。程序的入口函数是Mover.main

下面先给出主要的类Mover.java

  1. packagecn.mingyuan.csdn2wordpress;
  2. importjava.io.IOException;
  3. importjava.net.MalformedURLException;
  4. importjava.net.URL;
  5. importjava.text.ParseException;
  6. importjava.text.SimpleDateFormat;
  7. importjava.util.Date;
  8. importjava.util.HashMap;
  9. importjava.util.LinkedList;
  10. importjava.util.List;
  11. importjava.util.Map;
  12. importjava.util.concurrent.TimeUnit;
  13. importorg.apache.xmlrpc.XmlRpcException;
  14. importorg.apache.xmlrpc.client.XmlRpcClient;
  15. importorg.apache.xmlrpc.client.XmlRpcClientConfigImpl;
  16. importorg.jsoup.Jsoup;
  17. importorg.jsoup.nodes.Document;
  18. importorg.jsoup.nodes.Element;
  19. importorg.jsoup.select.Elements;
  20. /**
  21. *采集、解析、转移
  22. *
  23. *@authormingyuan
  24. *
  25. */
  26. publicclassMover{
  27. privateinttotalPages;
  28. privateXmlRpcClientConfigImplconfig;
  29. privateXmlRpcClientclient;
  30. privateStringbaseUrl;
  31. privateObjectuserName;
  32. privateObjectpassword;
  33. privateStringcsdnUserName;
  34. publicMover(inttotalPages,StringblogRpcUrl,StringcsdnUrl,StringcsdnUserName,StringuserName,
  35. Stringpassword){
  36. this.totalPages=totalPages;
  37. this.baseUrl=csdnUrl;
  38. this.csdnUserName=csdnUserName;
  39. this.userName=userName;
  40. this.password=password;
  41. config=newXmlRpcClientConfigImpl();
  42. try{
  43. config.setServerURL(newURL(blogRpcUrl));
  44. }catch(MalformedURLExceptione){
  45. System.out.println(“请检查url”);
  46. }
  47. client=newXmlRpcClient();
  48. client.setConfig(config);
  49. }
  50. privateList<String>getlinks(){
  51. List<String>list=newLinkedList<String>();
  52. for(inti=1;i<=totalPages;i++){
  53. System.out.println(“processingpage”+i);
  54. Downloaderdownloader=newDownloader();
  55. Stringcontent=downloader.download(baseUrl+“/”+csdnUserName+“/article/list/”+i);
  56. if(content==null)
  57. continue;
  58. Documentdoc=Jsoup.parse(content);
  59. Elementsfirst=doc.select(“.link_title”);
  60. for(intj=0;j<first.size();j++){
  61. Elementfirst2=first.get(j).select(“a”).first();
  62. Stringlink=baseUrl+first2.attr(“href”);
  63. list.add(link);
  64. System.out.println(“getlink\t”+link);
  65. }
  66. System.out.println(“page”+i+“extractordone,sleep2s”);
  67. try{
  68. TimeUnit.SECONDS.sleep(1);
  69. }catch(InterruptedExceptione){
  70. e.printStackTrace();
  71. }
  72. }
  73. returnlist;
  74. }
  75. publicList<CSDNPost>getPosts(){
  76. List<String>links=getlinks();
  77. List<CSDNPost>posts=newLinkedList<CSDNPost>();
  78. for(Stringlink:links){
  79. CSDNPostpost=getPost(link);
  80. if(post!=null){
  81. posts.add(post);
  82. }
  83. }
  84. returnposts;
  85. }
  86. privateCSDNPostgetPost(Stringurl){
  87. System.out.println(“url\t”+url);
  88. Downloaderdownloader=newDownloader();
  89. Stringhtml=downloader.download(url);
  90. if(html==null)
  91. returnnull;
  92. Documentdoc=Jsoup.parse(html);
  93. Stringtitle=doc.select(“.article_title”).first().text();
  94. Stringcategroy=“Uncategorized”;
  95. Elementslink_categories=doc.select(“.article_manage.link_categories”);
  96. if(link_categories!=null){
  97. Elementfirst=link_categories.first();
  98. if(first!=null){
  99. Elementshref=first.select(“a”);
  100. if(href!=null){
  101. categroy=href.text();
  102. }
  103. }
  104. }
  105. Stringpostdate=doc.select(“.article_manage.link_postdate”).first().text();
  106. Stringcontent=doc.select(“.details.article_content”).first().text();
  107. SimpleDateFormatsdf=newSimpleDateFormat(“yyyy-MM-ddHH:mm”);
  108. CSDNPostpost=newCSDNPost();
  109. post.setCategories(newString[]{categroy});
  110. post.setTitle(title);
  111. try{
  112. post.setDateCreated(sdf.parse(postdate));
  113. }catch(ParseExceptione){
  114. post.setDateCreated(newDate());
  115. }
  116. post.setDescription(content);
  117. returnpost;
  118. }
  119. publicvoidpublish(CSDNPostpost){
  120. Map<String,Object>struct=newHashMap<String,Object>();
  121. struct.put(“dateCreated”,post.getDateCreated());
  122. struct.put(“description”,post.getDescription());
  123. struct.put(“title”,post.getTitle());
  124. struct.put(“categories”,post.getCategories());
  125. Object[]params=newObject[]{userName,userName,password,struct,true};
  126. Stringblogid=null;
  127. try{
  128. blogid=(String)client.execute(“metaWeblog.newPost”,params);
  129. }catch(XmlRpcExceptione){
  130. e.printStackTrace();
  131. System.out.println(“导入出现错误:title=”+post.getTitle());
  132. }
  133. System.out.println(post.getTitle()+“>>导入完毕,生成博文id为>>”+blogid);
  134. struct.clear();
  135. }
  136. publicstaticvoidmain(String[]args)throwsIOException{
  137. Moverextractor=newMover(19,“http://youthmemo.com/xmlrpc.php”,“http://blog.csdn.net”,“telnetor”,“admin”,
  138. “xxxx”);
  139. List<CSDNPost>posts=extractor.getPosts();
  140. for(CSDNPostpost:posts){
  141. extractor.publish(post);
  142. try{
  143. TimeUnit.SECONDS.sleep(1);
  144. }catch(InterruptedExceptione){
  145. e.printStackTrace();
  146. }
  147. System.out.println(post.getTitle());
  148. }
  149. System.out.println(“done!”);
  150. }
  151. }

下面给出下载类Downloader.java

  1. packagecn.mingyuan.csdn2wordpress;
  2. importjava.io.BufferedReader;
  3. importjava.io.IOException;
  4. importjava.io.InputStreamReader;
  5. importorg.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
  6. importorg.apache.commons.httpclient.HttpClient;
  7. importorg.apache.commons.httpclient.HttpException;
  8. importorg.apache.commons.httpclient.HttpMethod;
  9. importorg.apache.commons.httpclient.HttpStatus;
  10. importorg.apache.commons.httpclient.cookie.CookiePolicy;
  11. importorg.apache.commons.httpclient.methods.GetMethod;
  12. importorg.apache.commons.httpclient.params.HttpClientParams;
  13. importorg.apache.commons.httpclient.params.HttpMethodParams;
  14. /**
  15. *downloader
  16. *
  17. *@authormingyuan
  18. *
  19. */
  20. publicclassDownloader{
  21. privateHttpClientParamsparams=null;
  22. privateHttpClientclient=null;
  23. /**
  24. *默认构造函数,初始化一系列变量
  25. */
  26. publicDownloader(){
  27. //构造HttpClientParams参数
  28. params=newHttpClientParams();
  29. params.setParameter(
  30. HttpClientParams.USER_AGENT,
  31. “Mozilla/5.0(Windows;U;WindowsNT5.1;zh-CN;rv:1.9.2.3)Gecko/20100401Firefox/3.6.3GTBDFffGTB7.0(.NETCLR3.5.30729)”);
  32. params.setParameter(HttpClientParams.ALLOW_CIRCULAR_REDIRECTS,false);
  33. params.setParameter(HttpClientParams.MAX_REDIRECTS,4);
  34. params.setParameter(HttpClientParams.CONNECTION_MANAGER_TIMEOUT,(long)60*1000);
  35. params.setParameter(HttpClientParams.SO_TIMEOUT,60*1000);
  36. //使用系统提供的默认的恢复策略
  37. params.setParameter(HttpMethodParams.RETRY_HANDLER,newDefaultHttpMethodRetryHandler());
  38. client=newHttpClient(params);
  39. }
  40. /**
  41. *下载网页
  42. *
  43. *@paramurl
  44. *网页url
  45. *@returnString类型的网页源码
  46. */
  47. publicStringdownload(Stringurl){
  48. HttpMethodmethod=newGetMethod(url);
  49. StringsourceCode=null;
  50. method.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
  51. //读取内容
  52. StringBuilderbuilder=newStringBuilder();
  53. BufferedReaderreader=null;
  54. try{
  55. intstatusCode=client.executeMethod(method);
  56. if(statusCode!=HttpStatus.SC_OK){
  57. returnnull;
  58. }
  59. reader=newBufferedReader(newInputStreamReader(method.getResponseBodyAsStream(),“utf8″));
  60. Stringline;
  61. while((line=reader.readLine())!=null){
  62. builder.append(line+“\r\n”);
  63. }
  64. sourceCode=builder.toString();
  65. }catch(HttpExceptione){
  66. e.printStackTrace();
  67. }catch(IOExceptione){
  68. e.printStackTrace();
  69. }finally{
  70. try{
  71. reader.close();
  72. }catch(IOExceptione){
  73. e.printStackTrace();
  74. }
  75. //释放连接
  76. method.releaseConnection();
  77. client.getHttpConnectionManager().closeIdleConnections(0);
  78. }
  79. returnsourceCode;
  80. }
  81. }

最后给出一个POJO类:CSDNPost.java

  1. packagecn.mingyuan.csdn2wordpress;
  2. importjava.util.Date;
  /**
   * A blog post extracted from CSDN: title, body text, categories and creation date.
   *
   * <p>The date and the category array are copied on the way in and on the way out, so changes
   * made by a caller to its own objects never alter the state held by a post.
   *
   * @author mingyuan
   */
  public class CSDNPost {
      /** Creation date of the post; may be {@code null}. */
      private Date dateCreated;
      /** Body of the post. */
      private String description;
      /** Title of the post. */
      private String title;
      /** Categories of the post; may be {@code null}. */
      private String[] categories;

      /** Creates an empty post; all properties are {@code null} until set. */
      public CSDNPost() {
      }

      /**
       * Creates a fully populated post.
       *
       * @param title       title of the post
       * @param description body of the post
       * @param categories  categories of the post, may be {@code null}
       * @param dateCreated creation date of the post, may be {@code null}
       */
      public CSDNPost(String title, String description, String[] categories, Date dateCreated) {
          this.dateCreated = copyOf(dateCreated);
          this.description = description;
          this.title = title;
          this.categories = copyOf(categories);
      }

      /** @return a copy of the creation date, or {@code null} if none is set */
      public Date getDateCreated() {
          return copyOf(dateCreated);
      }

      /** @param dateCreated creation date of the post, may be {@code null} */
      public void setDateCreated(Date dateCreated) {
          this.dateCreated = copyOf(dateCreated);
      }

      /** @return the body of the post */
      public String getDescription() {
          return description;
      }

      /** @param description body of the post */
      public void setDescription(String description) {
          this.description = description;
      }

      /** @return the title of the post */
      public String getTitle() {
          return title;
      }

      /** @param title title of the post */
      public void setTitle(String title) {
          this.title = title;
      }

      /** @return a copy of the categories, or {@code null} if none are set */
      public String[] getCategories() {
          return copyOf(categories);
      }

      /** @param categories categories of the post, may be {@code null} */
      public void setCategories(String[] categories) {
          this.categories = copyOf(categories);
      }

      /** Null-tolerant copy of a date. */
      private static Date copyOf(Date source) {
          return source == null ? null : new Date(source.getTime());
      }

      /** Null-tolerant copy of a string array. */
      private static String[] copyOf(String[] source) {
          return source == null ? null : source.clone();
      }
  }

以上是全部源码。

在文章的结尾,我愿意跟大家分享一下这个小程序的开发心得。

一开始写这个程序的时候,觉得会很快搞定,因为这个程序无非就是三个过程:采集、解析、发帖。其实也真是这样的一个过程。

这个程序耗费精力比较多的地方是在解析网页、提取链接、标题、内容、发布时间、分类方面。

一开始想用xpath解析网页,并且写好的xpath表达式都已在chrome上通过xpath helper验证通过了。但在编码阶段发现现有的工具包(比如dom4j)并不支持对html的解析,网上看到有通过htmlparser将html转换成xml的方法,但觉得太麻烦。最后发现了JSoup这个非常强大的工具,它可以通过类似jquery和css选择器语法的表达式来提取内容。尝试了下非常方便,于是解析这个问题就解决了(有个小窍门:chrome浏览器开发者工具可以看某节点的css样式,把这个样式直接传递给jsoup就能提取内容)。

wordpress支持MetaWeblog协议,可以通过XML-RPC进行发帖。关于它们的信息可以通过以下链接找到:

http://en.wikipedia.org/wiki/MetaWeblog

http://en.wikipedia.org/wiki/XML-RPC (可以找到各种语言版本的api)

另外JSoup的地址是:

http://jsoup.org/

程序写得太匆忙,肯定有很多不尽如人意的地方,希望各位指出。我的联系方式是:admin#youthmemo.com。