`

Excel clawer

 
阅读更多
Sub clawData()
    Dim companies() As String
    Dim rowNum As Long
    Dim keywords As String
    Dim ff As Long
   
    companyNum = Sheets("Sheet1").Range("g65536").End(xlUp).Row
    keywordsGeneral = getGeneralKeyWordsGeneral()
    keywordsEnvironment = getGeneralKeyWordsEnvironment()
    keywordsSocial = getGeneralKeyWordsSocial()
    keywordsGovernance = getGeneralKeyWordsGovernance()
   
    companies() = getCompanyList()
    For ff = 0 To companyNum Step 1
        If companies()(ff) <> "" Then
           keywordsGeneral = clawResult(CStr(keywordsGeneral), "General", CStr(companies()(ff)), ff * 4)
           keywordsGeneral = clawResult(CStr(keywordsEnvironment), "Environment", CStr(companies()(ff)), ff * 4 + 1)
           keywordsGeneral = clawResult(CStr(keywordsSocial), "Social", CStr(companies()(ff)), ff * 4 + 2)
           keywordsGeneral = clawResult(CStr(keywordsGovernance), "Governance", CStr(companies()(ff)), ff * 4 + 3)
        End If
    Next

    Shell ("taskkill /f /im IEXPLORE.exe")
    4
    End Sub
   
Function GetChs(strInput As String) As String
  Dim regEx As Object
  Set regEx = CreateObject("VBSCRIPT.REGEXP")

  regEx.Pattern = "[^\u4e00-\u9fa5]"
  regEx.IgnoreCase = True
  regEx.Global = True
  GetChs = regEx.Replace(strInput, "")
  Set regEx = Nothing
End Function

Function getGeneralKeyWordsGeneral() As String
  Dim d, a, c As Variant
    Dim strT
    Dim strs As String
    Dim array1
    array1 = Array()
    Set d = CreateObject("Scripting.Dictionary")
        For i = 2 To Sheets("Sheet1").Range("h65536").End(xlUp).Row Step 1
           strT = CStr(Sheets("Sheet1").Cells(i,)
           If strT <> "" Then
             d.Add i, strT
           End If
        Next
    For ii = 2 To d.Count + 1 Step 1
       If ii = d.Count + 1 Then
         strs = strs + d.Item(ii)
       Else
         strs = strs + d.Item(ii) + "+OR+"
       End If
    Next
   
getGeneralKeyWordsGeneral = strs


End Function

Function getGeneralKeyWordsEnvironment() As String
  Dim d, a, c As Variant
    Dim strT
    Dim strs As String
    Dim array1
    array1 = Array()
    Set d = CreateObject("Scripting.Dictionary")
        For i = 2 To Sheets("Sheet1").Range("i65536").End(xlUp).Row Step 1
           strT = CStr(Sheets("Sheet1").Cells(i, 9))
           If strT <> "" Then
             d.Add i, strT
           End If
        Next
    For ii = 2 To d.Count + 1 Step 1
       If ii = d.Count + 1 Then
         strs = strs + d.Item(ii)
       Else
         strs = strs + d.Item(ii) + "+OR+"
       End If
    Next
   
getGeneralKeyWordsEnvironment = strs


End Function

Function getGeneralKeyWordsSocial() As String
  Dim d, a, c As Variant
    Dim strT
    Dim strs As String
    Dim array1
    array1 = Array()
    Set d = CreateObject("Scripting.Dictionary")
        For i = 2 To Sheets("Sheet1").Range("j65536").End(xlUp).Row Step 1
           strT = CStr(Sheets("Sheet1").Cells(i, 10))
           If strT <> "" Then
             d.Add i, strT
           End If
        Next
    For ii = 2 To d.Count + 1 Step 1
       If ii = d.Count + 1 Then
         strs = strs + d.Item(ii)
       Else
         strs = strs + d.Item(ii) + "+OR+"
       End If
    Next
   
getGeneralKeyWordsSocial = strs


End Function
Function getGeneralKeyWordsGovernance() As String
  Dim d, a, c As Variant
    Dim strT
    Dim strs As String
    Dim array1
    array1 = Array()
    Set d = CreateObject("Scripting.Dictionary")
        For i = 2 To Sheets("Sheet1").Range("k65536").End(xlUp).Row Step 1
           strT = CStr(Sheets("Sheet1").Cells(i, 11))
           If strT <> "" Then
             d.Add i, strT
           End If
        Next
    For ii = 2 To d.Count + 1 Step 1
       If ii = d.Count + 1 Then
         strs = strs + d.Item(ii)
       Else
         strs = strs + d.Item(ii) + "+OR+"
       End If
    Next
   
getGeneralKeyWordsGovernance = strs


End Function

Function getCompanyList() As String()
  Dim d, a, c As Variant
    Dim strT
    Dim strs As String
    Dim array1(100) As String
    Set d = CreateObject("Scripting.Dictionary")
        For i = 2 To Sheets("Sheet1").Range("g65536").End(xlUp).Row Step 1
           'strT = GetChs(CStr(Sheets("Sheet1").Cells(i, 2)))
           strT = CStr(Sheets("Sheet1").Cells(i, 7))
           If strT <> "" Then
            array1(i - 2) = strT
           End If
        Next



getCompanyList = array1()


End Function
         
         
Function urlVerify(url As String) As Long
    Dim result As Long
    result = 1
    IFind = InStr(url, ".pdf")
    IFind2 = InStr(url, ".doc")
    IFind3 = InStr(url, ".xls")
    IFind4 = InStr(url, ".xlsx")
    IFind5 = InStr(url, ".ppt")
    If IFind = 0 And IFind2 = 0 And IFind3 = 0 And IFind4 = 0 And IFind5 = 0 Then
        result = 0
    End If
    urlVerify = result
End Function



Function clawResult(keywords As String, keyWordsType As String, companyName As String, companyLine As Long) As String
Dim ie, dmt, tb, i&, j&, a&, strx2 As String, ie2, dmt2, tb2, i2&
       
        For a = 0 To 4
       
        Set ie = CreateObject("InternetExplorer.Application")
        With ie
            .Visible = False
            .navigate "https://www.google.com.hk/search?q=" + keywords + "+%22+" + companyName + "%22&lr=lang_ja&newwindow=1&safe=strict&hl=zh-CN&as_qdr=all&tbs=lr:lang_1ja&ei=1LhIVKeUFc3W7Qb_oIGABQ&start=" + CStr(a) + "0&sa=N&biw=1920&bih=1016" '??§????3???§??3??
           
            Do Until .ReadyState = 4
                DoEvents
            Loop
            Set dmt = .document
            If TypeName(dmt) <> "AcroPDF" Then
                Set tb = dmt.all.tags("h3")
               
                For i = 0 To tb.Length - 1
                  strx = Split(tb.Item(i).innerHTML, "href=")
                  strx2 = Split(strx(1), """")(1)
                  Cells(companyLine * 50 + a * 10 + 2 + i, 1) = strx2
                  Cells(companyLine * 50 + a * 10 + 2 + i, 2) = companyName
                  Cells(companyLine * 50 + a * 10 + 2 + i, 3) = tb.Item(i).innertext
                  Cells(companyLine * 50 + a * 10 + 2 + i, 4) = keyWordsType
                  IFind = urlVerify(strx2)
                    If IFind = 0 Then
                           
                             Set ie2 = CreateObject("InternetExplorer.Application")
                                With ie2
                                .Visible = False
                                    .navigate strx2
                                    Do Until .ReadyState = 4 Or .busy = False
                                        DoEvents
                                    Loop

                                    Set dmt2 = .document
                                    If TypeName(dmt2) <> "AcroPDF" Then
                                        Set tb2 = dmt2.all.tags("p")
                                       
                                        For i2 = 0 To tb2.Length - 1
                                             strs2 = strs2 & vbCrLf & tb2.Item(i2).innertext
                                        Next
                                         Cells(companyLine * 50 + a * 10 + 2 + i, 5) = strs2
                                         strs2 = ""
                                    End If
                                End With
                    
                    End If
                    
                Next
            End If
        End With
       
        Next
       
        Shell ("taskkill /f /im IEXPLORE.exe")
       
        newHour = Hour(Now())
        newMinute = Minute(Now())
        newSecond = Second(Now()) + 5
        waitTime = TimeSerial(newHour, newMinute, newSecond)
        Application.Wait waitTime
        clawResult = ""
       
       

    End Function
分享到:
评论

相关推荐

    clawer_for_douban:Java的douban的clawer

    【标题】"clawer_for_douban:Java的douban的clawer" 提供的信息表明,这是一个使用Java编程语言编写的豆瓣(Douban)数据抓取工具,通常被称为网络爬虫(Web Crawler)。网络爬虫是自动化地从互联网上获取大量信息的...

    用例统计

    Project Web App 参见 在本地运行Tracker Web应用 npm start 在开发模式下运行应用程序。 打开在浏览器中查看。 如果进行编辑,页面将... node ./src/clawer.js并将生成一个data.json5 提交对生成的data.json5请求

    About_Node:学习笔记:有关nodejs的一些示例和摘要

    1.clawer:nodejs相关的爬虫总结 2.require_way:nodejs模块的加载方式 3.clawer_github_stars:nodejs爬取github项目star数 4.upload:nodejs文件上传服务器 5.quickrun: 用进程模块从命令行快速启动应用 6....

    算法基础+算法强化+算法系统提升视频.zip

    网盘文件永久链接 九章算法;目录中文件数:10个 2,九章算法基础班;目录中文件数:20个 ...clawer2 crawler1 jiuzhang_mapreduce_1 jiuzhang_mapreduce_2 lbs1.wmv lbs2.wmv 九章系统设计_bigtable .....

    ptt_data_parse

    PTT Watcher一个分析ptt web版的爬虫主要爬web 版ptt来找热门文章以讨论程度做文章排行(后来加上fb分享做判定) web clawer : python3 (主要为selenium)

    Java视觉中国图片爬虫jar包

    5. **配置文件管理**:`clawer.properties`是配置文件,用于存储爬虫的运行参数,如关键词、下载路径、请求间隔等。用户可以通过修改这个文件来定制爬虫的行为。 6. **批处理脚本**:`start.bat`是一个批处理脚本,...

    Python开发-备份豆瓣计划

    8. 版本控制:文件名`JimSunJing-douban_clawer-50c4f38`暗示了使用了版本控制系统,可能是Git,`50c4f38`是一个Git提交哈希,意味着项目源码有版本历史,便于团队协作和代码追踪。 在实际项目中,可能还需要考虑...

Global site tag (gtag.js) - Google Analytics