R语言基于RSelenium模拟浏览器抓取Database Commons数据-连载NO.04
A:代码的注释不算非常详细,但这些代码都经过实际数据抓取的反复使用,有兴趣或有需要的读者可以搭建好环境后尝试
B:没有针对抓取效率进行优化,对耗时有要求的读者需要自行调整判断条件并增加相应逻辑
1、alert弹窗处理
# Check whether a browser alert is currently open and, if so, dismiss it.
#
# Args:
#   remdr:     driver object exposing getAlertText() and dismissAlert()
#              (the RSelenium remote driver in this script).
#   wait_secs: seconds to pause after dismissing the alert (default 1).
#
# Returns (invisibly) TRUE when an alert was read and dismissed, FALSE when
# reading or dismissing it raised an error (the usual case: no alert open).
handle_alert_if_exists <- function(remdr, wait_secs = 1) {
  alert_handled <- tryCatch(
    {
      alert_text <- remdr$getAlertText()[[1]]
      cat("⚠️ 捕获弹窗:", alert_text, "\n")
      remdr$dismissAlert()
      Sys.sleep(wait_secs) # let the alert close before continuing
      TRUE
    },
    error = function(e) {
      # No alert to handle: deliberately do nothing.
      FALSE
    }
  )
  invisible(alert_handled)
}
2、抓取数据的关键词
# Keywords that will be searched one after another
search_content <- c(
  "rna",
  "cell",
  "dna",
  "protein"
)
# Page the browser is navigated to before the searches start
spider_url_base <- "https://ngdc.cncb.ac.cn/databasecommons/#stat"
3、隐藏selenium的特征
# JavaScript that hides common Selenium/automation markers. It is kept as a
# single string and sent to the browser with executeScript().
#
# The script:
#   * makes navigator.webdriver report undefined,
#   * defines a window.chrome object with a runtime member,
#   * wraps navigator.permissions.query so that 'notifications' answers with
#     Notification.permission and every other request is forwarded to the
#     original implementation.
#
# The original query method is bound to navigator.permissions before it is
# stored: calling the detached method without its receiver fails for every
# forwarded (non-notification) request.
hide_automation_script <- "
Object.defineProperty(navigator, 'webdriver', {get: () => undefined
});
window.chrome = {runtime: {},// etc.
};
const originalQuery = window.navigator.permissions.query.bind(window.navigator.permissions);
window.navigator.permissions.query = (parameters) => (parameters.name === 'notifications' ?Promise.resolve({ state: Notification.permission }) :originalQuery(parameters)
);
"
# 4. Start Selenium and launch the browser ----
# Start the Selenium standalone server as a background process.
# The jar path inside the command is a placeholder that must be replaced with
# the real location of the file.
system(
  "java -jar \"你的文件位置/selenium-server-standalone-3.141.59.jar\"  -port 4449",
  wait = FALSE,
  invisible = FALSE
)
# The server was launched with wait = FALSE, so give it a moment to start
# listening before a client connects to it.
Sys.sleep(5)

# Client for the server started above (same port).
remdr <- remoteDriver(
  browserName = "firefox",
  # remoteServerAddr = "127.0.0.1",
  version = "0.36.0",
  # extraCapabilities = list("moz:firefoxOptions" = list()),
  port = 4449L
)
# In cmd, list port usage with:
#   netstat -ano
# In PowerShell, check whether the port is in use with:
#   Get-NetTCPConnection | Select-Object LocalAddress, LocalPort, State, OwningProcess | Sort-Object LocalPort
remdr$open()
# 5. Scrape and save the data ----
A:这部分涉及2层循环:1、遍历所有关键词的外循环;2、遍历当前关键词所有页数的内循环
B:涉及翻页按钮的操作,以及是否已抓取至最后一页的判断
# Open the target page and measure how long the navigation takes
ptm <- proc.time()
remdr$navigate(spider_url_base) # note: this page load takes a while
proc.time() - ptm

# Run the automation-hiding JavaScript in the browser
remdr$executeScript(script = hide_automation_script)

# Empty table that will hold the scraped rows of every keyword
bio_database_all <- data.frame()
# Return the first element of `x`, or `default` when `x` is empty.
first_or <- function(x, default) {
  if (length(x) > 0) x[[1]] else default
}

# Turn the result table of one parsed page into a data frame.
#
# Args:
#   page_html:   parsed HTML document of the current result page.
#   search_word: keyword that produced the page; stored in every row.
#
# Returns a data frame with one row per table row. Cells are read row by row,
# so a row with a missing cell or link yields NA in that column instead of
# making the columns differ in length.
parse_database_table <- function(page_html, search_word) {
  table_rows <- page_html %>%
    html_nodes(xpath = "//table[@class='ui table celled dataTable']//tbody/tr")

  # Text of the idx-th cell of every row (NA where the cell is missing).
  cell_text <- function(idx) {
    table_rows %>%
      html_node(xpath = sprintf("./td[%d]", idx)) %>%
      html_text()
  }
  # href of the idx-th link in the first cell of every row (NA if missing).
  link_href <- function(idx) {
    table_rows %>%
      html_node(xpath = sprintf("./td[1]/a[%d]", idx)) %>%
      html_attr("href")
  }

  page_data <- data.frame(
    database_name = cell_text(1L),
    database_link_intro = link_href(1L),
    database_link = link_href(2L),
    full_name = cell_text(2L),
    data_object = cell_text(3L),
    data_type = cell_text(4L),
    database_category = cell_text(5L),
    keyword = cell_text(6L),
    location = cell_text(7L),
    host_institution = cell_text(8L),
    founded_year = cell_text(9L),
    citation = cell_text(10L),
    z_index = cell_text(11L),
    description = cell_text(12L),
    search_word = rep(search_word, length(table_rows)),
    stringsAsFactors = FALSE
  )

  # Keep only rows that have a second cell.
  # NOTE(review): presumably an empty result set is rendered as a single
  # placeholder row without the regular cells - confirm against the live page.
  page_data[!is.na(page_data$full_name), , drop = FALSE]
}

# Outer loop: one pass per keyword. Inner `repeat`: one pass per result page.
for (i in seq_along(search_content)) {
  Sys.sleep(runif(1, 5, 10))

  # Locate the search box and type the keyword
  search_box <- remdr$findElement(using = "xpath", value = "//input[@id='q']")
  Sys.sleep(runif(1, 5, 10))
  search_box$sendKeysToElement(list(search_content[i]))

  # Locate the search button and click it
  search_button <- remdr$findElement(using = "xpath", value = "//*[@id='search']")
  Sys.sleep(runif(1, 3, 6)) # wait first, otherwise the page count may be wrong
  search_button$clickElement()

  # Report progress for the current keyword only
  cat(
    sprintf(
      "Progress: %s",
      str_c(search_content[i], "的数据库信息已开始提取,当前时间:", Sys.time())
    ),
    sep = "\n"
  )

  # Per-keyword storage; it must be reset here or rows from the previous
  # keyword would pile up in it.
  bio_database_eachword <- data.frame()
  loop_times <- 0 # number of pages processed for this keyword

  # Set the number of rows shown per page
  Sys.sleep(runif(1, 6, 9))
  select_pagenum_per <- remdr$findElement(
    using = "xpath",
    value = "//div[@class='selection ui dropdown']"
  )
  select_pagenum_per$clickElement()
  Sys.sleep(runif(1, 5, 8)) # wait for the menu, otherwise the choice may not apply
  select_pagenum <- remdr$findElement(
    using = "xpath",
    value = "//div[@class='menu transition visible']//div[@data-value='20']"
  )
  select_pagenum$clickElement()
  Sys.sleep(runif(1, 5, 7))

  # Re-read the HTML after changing the page size (very important)
  page_html_source_new <- remdr$getPageSource()[[1]]
  page_html_content_new <- rvest::read_html(page_html_source_new)
  page_num_max <- page_html_content_new %>%
    html_nodes(xpath = "//a[@class='paginate_button item ' and @data-dt-idx='7']") %>%
    html_text()
  print(page_num_max)

  # Scrape every result page of the current keyword
  repeat {
    ptm <- proc.time() # start timing this page
    Sys.sleep(runif(1, 3, 6))

    # Fetch and parse the complete HTML of the current page
    page_html_source <- remdr$getPageSource()[[1]]
    page_html_content <- rvest::read_html(page_html_source)

    # Extract the table and append it right away, so the rows collected so far
    # stay available in bio_database_eachword if a later page fails.
    bio_database_pageeach <- parse_database_table(page_html_content, search_content[i])
    bio_database_eachword <- bind_rows(bio_database_eachword, bio_database_pageeach)

    # Progress: current page out of the total number of pages
    current_page <- page_html_content %>%
      html_nodes(xpath = "//a[@class='paginate_button item active']") %>%
      html_text()
    cat(
      sprintf(
        "Progress: %s",
        str_c(
          search_content[i], "的数据库信息已开始提取至第", first_or(current_page, "?"),
          "页,共", first_or(page_num_max, "?"), "页数据,当前时间:", Sys.time()
        )
      ),
      sep = "\n"
    )
    # Values are not auto-printed inside a loop, so print the timing explicitly.
    print(proc.time() - ptm)

    # Class attribute of the "next" button; it is looked up as an <a> first
    # and as a <div> when no such <a> exists.
    current_page_nextbutton_back1 <- page_html_content %>%
      html_node(xpath = "//a[contains(@class, 'paginate_button') and contains(@class, 'next')]") %>%
      html_attr("class")
    current_page_nextbutton_back2 <- page_html_content %>%
      html_node(xpath = "//div[contains(@class, 'paginate_button') and contains(@class, 'next')]") %>%
      html_attr("class")
    current_page_nextbutton <- if (is.na(current_page_nextbutton_back1)) {
      current_page_nextbutton_back2
    } else {
      current_page_nextbutton_back1
    }

    loop_times <- loop_times + 1
    cat("当前页面按钮是否可以翻页", current_page_nextbutton, "第", loop_times, "页", "\n")

    # Stop on the last page: the next button is disabled, or there is no next
    # button at all (NA), in which case there is nothing left to click.
    if (is.na(current_page_nextbutton) ||
      str_detect(current_page_nextbutton, "disabled")) {
      cat("已到达最后一页,下一页按钮不可点击", "当前页按钮:", current_page_nextbutton, sep = "\n")
      break
    }

    handle_alert_if_exists(remdr)

    # Go to the next page
    nextpage_button_xpath <- "//a[@class='paginate_button item next']"
    nextpage_button <- remdr$findElement(using = "xpath", value = nextpage_button_xpath)
    # Clicking again too soon may mean the page turn does not take effect
    nextpage_button$clickElement()
    # Wait for the next page to load
    Sys.sleep(runif(1, 4, 7))
  }

  bio_database_all <- bind_rows(bio_database_all, bio_database_eachword)
  Sys.sleep(runif(1, 5, 7))

  # Clear the search box for the next keyword. After a search the page has two
  # inputs; this XPath selects the small search box.
  search_box <- remdr$findElement(using = "xpath", value = "//input[@id='q']")
  # search_box <- remdr$findElement(using = "xpath", value = "//div[@class='ui action input']/input[@id='term']")
  Sys.sleep(runif(1, 3, 6))
  search_box$clearElement()
  # remdr$close()
}
