Sub clawData()
Dim companies() As String
Dim rowNum As Long
Dim keywords As String
Dim ff As Long
companyNum = Sheets("Sheet1").Range("g65536").End(xlUp).Row
keywordsGeneral = getGeneralKeyWordsGeneral()
keywordsEnvironment = getGeneralKeyWordsEnvironment()
keywordsSocial = getGeneralKeyWordsSocial()
keywordsGovernance = getGeneralKeyWordsGovernance()
companies() = getCompanyList()
For ff = 0 To companyNum Step 1
If companies()(ff) <> "" Then
keywordsGeneral = clawResult(CStr(keywordsGeneral), "General", CStr(companies()(ff)), ff * 4)
keywordsGeneral = clawResult(CStr(keywordsEnvironment), "Environment", CStr(companies()(ff)), ff * 4 + 1)
keywordsGeneral = clawResult(CStr(keywordsSocial), "Social", CStr(companies()(ff)), ff * 4 + 2)
keywordsGeneral = clawResult(CStr(keywordsGovernance), "Governance", CStr(companies()(ff)), ff * 4 + 3)
End If
Next
Shell ("taskkill /f /im IEXPLORE.exe")
4
End Sub
Function GetChs(strInput As String) As String
Dim regEx As Object
Set regEx = CreateObject("VBSCRIPT.REGEXP")
regEx.Pattern = "[^\u4e00-\u9fa5]"
regEx.IgnoreCase = True
regEx.Global = True
GetChs = regEx.Replace(strInput, "")
Set regEx = Nothing
End Function
Function getGeneralKeyWordsGeneral() As String
Dim d, a, c As Variant
Dim strT
Dim strs As String
Dim array1
array1 = Array()
Set d = CreateObject("Scripting.Dictionary")
For i = 2 To Sheets("Sheet1").Range("h65536").End(xlUp).Row Step 1
strT = CStr(Sheets("Sheet1").Cells(i,
)
If strT <> "" Then
d.Add i, strT
End If
Next
For ii = 2 To d.Count + 1 Step 1
If ii = d.Count + 1 Then
strs = strs + d.Item(ii)
Else
strs = strs + d.Item(ii) + "+OR+"
End If
Next
getGeneralKeyWordsGeneral = strs
End Function
Function getGeneralKeyWordsEnvironment() As String
Dim d, a, c As Variant
Dim strT
Dim strs As String
Dim array1
array1 = Array()
Set d = CreateObject("Scripting.Dictionary")
For i = 2 To Sheets("Sheet1").Range("i65536").End(xlUp).Row Step 1
strT = CStr(Sheets("Sheet1").Cells(i, 9))
If strT <> "" Then
d.Add i, strT
End If
Next
For ii = 2 To d.Count + 1 Step 1
If ii = d.Count + 1 Then
strs = strs + d.Item(ii)
Else
strs = strs + d.Item(ii) + "+OR+"
End If
Next
getGeneralKeyWordsEnvironment = strs
End Function
Function getGeneralKeyWordsSocial() As String
Dim d, a, c As Variant
Dim strT
Dim strs As String
Dim array1
array1 = Array()
Set d = CreateObject("Scripting.Dictionary")
For i = 2 To Sheets("Sheet1").Range("j65536").End(xlUp).Row Step 1
strT = CStr(Sheets("Sheet1").Cells(i, 10))
If strT <> "" Then
d.Add i, strT
End If
Next
For ii = 2 To d.Count + 1 Step 1
If ii = d.Count + 1 Then
strs = strs + d.Item(ii)
Else
strs = strs + d.Item(ii) + "+OR+"
End If
Next
getGeneralKeyWordsSocial = strs
End Function
Function getGeneralKeyWordsGovernance() As String
Dim d, a, c As Variant
Dim strT
Dim strs As String
Dim array1
array1 = Array()
Set d = CreateObject("Scripting.Dictionary")
For i = 2 To Sheets("Sheet1").Range("k65536").End(xlUp).Row Step 1
strT = CStr(Sheets("Sheet1").Cells(i, 11))
If strT <> "" Then
d.Add i, strT
End If
Next
For ii = 2 To d.Count + 1 Step 1
If ii = d.Count + 1 Then
strs = strs + d.Item(ii)
Else
strs = strs + d.Item(ii) + "+OR+"
End If
Next
getGeneralKeyWordsGovernance = strs
End Function
Function getCompanyList() As String()
Dim d, a, c As Variant
Dim strT
Dim strs As String
Dim array1(100) As String
Set d = CreateObject("Scripting.Dictionary")
For i = 2 To Sheets("Sheet1").Range("g65536").End(xlUp).Row Step 1
'strT = GetChs(CStr(Sheets("Sheet1").Cells(i, 2)))
strT = CStr(Sheets("Sheet1").Cells(i, 7))
If strT <> "" Then
array1(i - 2) = strT
End If
Next
getCompanyList = array1()
End Function
Function urlVerify(url As String) As Long
Dim result As Long
result = 1
IFind = InStr(url, ".pdf")
IFind2 = InStr(url, ".doc")
IFind3 = InStr(url, ".xls")
IFind4 = InStr(url, ".xlsx")
IFind5 = InStr(url, ".ppt")
If IFind = 0 And IFind2 = 0 And IFind3 = 0 And IFind4 = 0 And IFind5 = 0 Then
result = 0
End If
urlVerify = result
End Function
Function clawResult(keywords As String, keyWordsType As String, companyName As String, companyLine As Long) As String
Dim ie, dmt, tb, i&, j&, a&, strx2 As String, ie2, dmt2, tb2, i2&
For a = 0 To 4
Set ie = CreateObject("InternetExplorer.Application")
With ie
.Visible = False
.navigate "https://www.google.com.hk/search?q=" + keywords + "+%22+" + companyName + "%22&lr=lang_ja&newwindow=1&safe=strict&hl=zh-CN&as_qdr=all&tbs=lr:lang_1ja&ei=1LhIVKeUFc3W7Qb_oIGABQ&start=" + CStr(a) + "0&sa=N&biw=1920&bih=1016" '??§????3???§??3??
Do Until .ReadyState = 4
DoEvents
Loop
Set dmt = .document
If TypeName(dmt) <> "AcroPDF" Then
Set tb = dmt.all.tags("h3")
For i = 0 To tb.Length - 1
strx = Split(tb.Item(i).innerHTML, "href=")
strx2 = Split(strx(1), """")(1)
Cells(companyLine * 50 + a * 10 + 2 + i, 1) = strx2
Cells(companyLine * 50 + a * 10 + 2 + i, 2) = companyName
Cells(companyLine * 50 + a * 10 + 2 + i, 3) = tb.Item(i).innertext
Cells(companyLine * 50 + a * 10 + 2 + i, 4) = keyWordsType
IFind = urlVerify(strx2)
If IFind = 0 Then
Set ie2 = CreateObject("InternetExplorer.Application")
With ie2
.Visible = False
.navigate strx2
Do Until .ReadyState = 4 Or .busy = False
DoEvents
Loop
Set dmt2 = .document
If TypeName(dmt2) <> "AcroPDF" Then
Set tb2 = dmt2.all.tags("p")
For i2 = 0 To tb2.Length - 1
strs2 = strs2 & vbCrLf & tb2.Item(i2).innertext
Next
Cells(companyLine * 50 + a * 10 + 2 + i, 5) = strs2
strs2 = ""
End If
End With
End If
Next
End If
End With
Next
Shell ("taskkill /f /im IEXPLORE.exe")
newHour = Hour(Now())
newMinute = Minute(Now())
newSecond = Second(Now()) + 5
waitTime = TimeSerial(newHour, newMinute, newSecond)
Application.Wait waitTime
clawResult = ""
End Function
分享到:
相关推荐
【标题】"clawer_for_douban:Java的douban的clawer" 提供的信息表明,这是一个使用Java编程语言编写的豆瓣(Douban)数据抓取工具,通常被称为网络爬虫(Web Crawler)。网络爬虫是自动化地从互联网上获取大量信息的...
Project Web App 参见 在本地运行Tracker Web应用 npm start 在开发模式下运行应用程序。 打开在浏览器中查看。 如果进行编辑,页面将... node ./src/clawer.js并将生成一个data.json5 提交对生成的data.json5请求
1.clawer:nodejs相关的爬虫总结 2.require_way:nodejs模块的加载方式 3.clawer_github_stars:nodejs爬取github项目star数 4.upload:nodejs文件上传服务器 5.quickrun: 用进程模块从命令行快速启动应用 6....
网盘文件永久链接 九章算法;目录中文件数:10个 2,九章算法基础班;目录中文件数:20个 ...clawer2 crawler1 jiuzhang_mapreduce_1 jiuzhang_mapreduce_2 lbs1.wmv lbs2.wmv 九章系统设计_bigtable .....
PTT Watcher一个分析ptt web版的爬虫主要爬web 版ptt来找热门文章以讨论程度做文章排行(后来加上fb分享做判定) web clawer : python3 (主要为selenium)
5. **配置文件管理**:`clawer.properties`是配置文件,用于存储爬虫的运行参数,如关键词、下载路径、请求间隔等。用户可以通过修改这个文件来定制爬虫的行为。 6. **批处理脚本**:`start.bat`是一个批处理脚本,...
8. 版本控制:文件名`JimSunJing-douban_clawer-50c4f38`暗示了使用了版本控制系统,可能是Git,`50c4f38`是一个Git提交哈希,意味着项目源码有版本历史,便于团队协作和代码追踪。 在实际项目中,可能还需要考虑...