# ==================================================================
# Price-comparison crawler ("抓資料用")
#
# Scrapes product listings for a keyword from three Taiwanese
# price-comparison sites, writes one timestamped CSV per site under
# ../data/, then merges everything into a single CSV + XLSX under all/.
#
# Example listing URLs the three site configs below correspond to:
#   https://www.findprice.com.tw/g/<keyword>/?i=1
#   https://feebee.com.tw/s/?q=<keyword>
#   https://biggo.com.tw/s/<keyword>/?p=1
# ==================================================================

library(httr)
library(Rcrawler)
library(jsonlite)
library(methods)
library(dplyr)
library(anytime)
library(xml2)
library(rvest)
library(curl)
library(stringr)
library(openxlsx)  # provides write.xlsx() used in combin_all() (was called but never loaded)

# Crawl one comparison site for `keyword` across `npage` listing pages
# and write the scraped products to a timestamped CSV under ../data/.
#
# target_site  host name of the site; selects URL pattern, crawl filters
#              and XPath sets via the switch below
# keyword      search term (raw UTF-8; URL-encoded where required)
# npage        number of listing pages to request
#
# Side effects: changes the working directory (Rcrawler writes its
# project folders under cwd), performs network I/O, writes a CSV, and
# deletes the downloaded HTML via del_dir() on success.
export_csv <- function(target_site, keyword, npage) {
  # Per-site configuration. Two XPath sets per site:
  #   *.xpath1 — result pages showing a min~max price range
  #   *.xpath2 — single-item result pages with one price
  switch(
    target_site,
    www.findprice.com.tw = {
      location <- paste0("/home/rstudio/rcrawler/", target_site)
      setwd(location)
      geturl <- URLencode(paste0("https://", target_site, "/g/", keyword, "/?i="))
      ignoreUrlParams <- c("[Supplier]", "[merchantlist]")
      dataUrlfilter  <- c(paste0("[/g/", URLencode(keyword), "/i=]"))
      crawlUrlfilter <- c(paste0("[/g/", URLencode(keyword), "/i=]"))
      產品名稱.xpath1     <- "//a[@class='ga']/div"
      產品網址.xpath1     <- "//td[@valign='top']/a[@class='ga']/@href"
      產品價格範圍.xpath1 <- "//span[@class='rec-price']/text()"
      產品名稱.xpath2 <- "//a[@class='ga']/text()"
      產品網址.xpath2 <- "//td[@valign='middle']/a[@class='ga']/@href"
      產品來源.xpath2 <- "//span/font[@style='color:#009900']/text()"
      產品價格.xpath2 <- "//td[@class='tdPrice']/text()"
    },
    feebee.com.tw = {
      location <- paste0("/home/rstudio/rcrawler/", target_site)
      setwd(location)
      geturl <- URLencode(paste0("https://", target_site, "/s/", keyword,
                                 "/?sort=d&mode=l&ptab=0&page="))
      ignoreUrlParams <- c("[Supplier]", "[merchantlist]")
      dataUrlfilter  <- c(paste0("[/s/", URLencode(keyword), "/?mode=l&ptab=0&page=]"))
      crawlUrlfilter <- c(paste0("[/s/", URLencode(keyword), "/?mode=l&ptab=0&page=]"))
      產品名稱.xpath1     <- "//a[@class='product_link']/h3"
      產品網址.xpath1     <- "//a[@class='btn_comparison vertical_middle']/@href"
      產品價格範圍.xpath1 <- "//div[@class='product_group_price price ellipsis xlarge']"
      產品名稱.xpath2 <- "//a[@class='items_link']/h3/text()"
      產品網址.xpath2 <- "//li[@class='action']/a[@class='btn_buy items_link']/@href"
      產品來源.xpath2 <- "//div[@class='pure-u promote_info vertical_middle small']/span[@class='shop']/img/@alt"
      產品價格.xpath2 <- "//li[@class='price ellipsis xlarge']/text()"
    },
    biggo.com.tw = {
      location <- paste0("/home/rstudio/rcrawler/", target_site)
      setwd(location)
      geturl <- URLencode(paste0("https://", target_site, "/s/", keyword, "/?p="))
      ignoreUrlParams <- c("[Supplier]", "[merchantlist]")
      dataUrlfilter  <- c(paste0("[/s/", URLencode(keyword), "/?p=]"))
      crawlUrlfilter <- c(paste0("[/s/", URLencode(keyword), "/?p=]"))
      產品名稱.xpath1     <- "//a[@data-id='biggo_product']/div[@class='titlemax']"
      產品網址.xpath1     <- "//a[@data-id='biggo_product']/@href"
      產品價格範圍.xpath1 <- "//div[contains(@id,'itemlist_') and not(contains(@id,'itemlist_-1'))]/div/div/div/div/span/strong/text()"
      產品名稱.xpath2 <- "//div[contains(@id,'itemlist_') and not(contains(@id,'itemlist_-1'))]/*//h2/a/@data-title"
      產品網址.xpath2 <- "//div[contains(@id,'itemlist_') and not(contains(@id,'itemlist_-1'))]/*//h2/a/@data-href"
      產品來源.xpath2 <- "//div[contains(@id,'itemlist_') and not(contains(@id,'itemlist_-1'))]/*//h2/a/@data-id"
      產品價格.xpath2 <- "//div[contains(@id,'itemlist_') and not(contains(@id,'itemlist_-1'))]/*//h2/a/@data-price"
    },
    {
      # Unknown site: no config is set, so the crawl below will fail fast.
      print("default")
    }
  )

  # One listing URL per page number: <geturl>1, <geturl>2, ...
  urllist <- sapply(seq_len(npage), function(x) paste0(geturl, x))

  Rcrawler(
    Website = urllist,
    no_cores = 1,
    no_conn = npage,
    MaxDepth = 1,
    RequestsDelay = 5,  # be polite: 5 s between requests
    Obeyrobots = FALSE,
    # DIR = "www.findprice.com.tw/",
    Useragent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    Encod = "utf-8",
    ignoreUrlParams = ignoreUrlParams,
    dataUrlfilter = dataUrlfilter,
    crawlUrlfilter = crawlUrlfilter,
    Vbrowser = FALSE
  )

  getProjectList <- ListProjects()
  l <- length(getProjectList)
  產品清單 <- data.frame()

  # NOTE(review): `l:l` deliberately visits only the LAST (most recent)
  # Rcrawler project — i.e. the crawl that just finished above.
  for (l in l:l) {
    DataHTML <- sapply(getProjectList[l], function(x) {
      LoadHTMLFiles(x, type = "list")
    })

    # Pass 1: pages that show a "min ~ max" price range (xpath1 set).
    for (x in seq_along(DataHTML)) {
      產品名稱 <- ContentScraper(
        HTmlText = DataHTML[[x]],
        XpathPatterns = 產品名稱.xpath1,
        ManyPerPattern = TRUE
      )
      產品網址 <- ContentScraper(
        HTmlText = DataHTML[[x]],
        XpathPatterns = 產品網址.xpath1,
        ManyPerPattern = TRUE
      )
      產品價格範圍 <- ContentScraper(
        HTmlText = DataHTML[[x]],
        XpathPatterns = 產品價格範圍.xpath1,
        ManyPerPattern = TRUE
      )

      # Normalise "$1,234 ~ $5,678" into separate min / max values.
      price <- gsub("[$, ]", "", unlist(產品價格範圍))
      price_tmp <- strsplit(as.character(price), "[~|~]", fixed = FALSE)
      a <- as.data.frame(price_tmp, row.names = NULL)
      if (length(unlist(a)) > 1) {
        price_min <- data.frame(t(a), row.names = NULL)[1]
        price_max <- data.frame(t(a), row.names = NULL)[2]
      }
      if (length(unlist(a)) == 1) {
        # Single price: min and max are the same value.
        price_min <- data.frame(t(a), row.names = NULL)[1]
        price_max <- data.frame(t(a), row.names = NULL)[1]
      }
      if (length(unlist(a)) == 0) {
        price_min <- NULL
        price_max <- NULL
      }
      if (length(a) != 0) {
        tmp <- cbind(
          產品名稱 = unlist(產品名稱),
          產品網址 = gsub("[\n ]", "", unlist(產品網址)),
          產品來源 = unlist(rep(list("0"), length(a))),  # no source on range pages; pad with "0"
          price_min = (price_min),
          price_max = (price_max)
        )
        colnames(tmp) <- c("產品名稱", "產品網址", "產品來源", "最小金額", "最大金額")
        產品清單 <- rbind(產品清單, tmp)
      }
    }

    # Pass 2: single-item listing pages (xpath2 set).
    for (x in seq_along(DataHTML)) {
      產品名稱 <- ContentScraper(
        HTmlText = DataHTML[[x]],
        XpathPatterns = 產品名稱.xpath2,
        ManyPerPattern = TRUE
      )
      產品網址 <- ContentScraper(
        HTmlText = DataHTML[[x]],
        XpathPatterns = 產品網址.xpath2,
        ManyPerPattern = TRUE
      )
      產品來源 <- ContentScraper(
        HTmlText = DataHTML[[x]],
        XpathPatterns = 產品來源.xpath2,
        ManyPerPattern = TRUE
      )
      產品價格 <- ContentScraper(
        HTmlText = DataHTML[[x]],
        XpathPatterns = 產品價格.xpath2,
        ManyPerPattern = TRUE
      )

      # Strip currency symbols, thousands separators and letters.
      price <- gsub("[A-Za-z$, ]", "", unlist(產品價格))
      if (length(price) != 0) {
        tmp <- cbind(
          產品名稱 = unlist(產品名稱),
          產品網址 = gsub("[\n ]", "", unlist(產品網址)),
          產品來源 = gsub("[\n ]", "", unlist(產品來源)),
          # NOTE(review): the original hard-codes the minimum to 0 for
          # single-price items (only the maximum carries the real price)
          # — preserved as-is; confirm whether min should equal price.
          price = 0,
          price = (price)
        )
        colnames(tmp) <- c("產品名稱", "產品網址", "產品來源", "最小金額", "最大金額")
        產品清單 <- rbind(產品清單, tmp)
      }
    }
  }
  rm(DataHTML)

  產品清單 <- unique(產品清單)  # drop duplicate rows
  整理 <- 產品清單[with(產品清單, order(產品名稱)), ]  # sort by product name

  output_path <- "../data/"
  output_filename <- paste0(keyword, "-", target_site, "-",
                            format(Sys.time(), "%m%d%H%M"), ".csv")
  # row.names = FALSE keeps the file at exactly the 5 columns that
  # combin_all()'s read.csv(col.names = ...) expects (the original wrote
  # a 6th row-name column, which broke the merge).
  write.csv(整理, file = paste0(output_path, output_filename), row.names = FALSE)

  # Clean up the downloaded HTML only after the CSV is safely written.
  del_dir(target_site, output_path, output_filename)
}

# Delete the downloaded Rcrawler project folders for `target_site`.
# Guards: only deletes when the working directory is the expected
# per-site crawl directory AND the exported CSV actually exists, so the
# raw HTML is never removed before a successful export.
del_dir <- function(target_site, output_path, output_filename) {
  dirnames <- dir(
    path = ".",
    pattern = target_site,
    full.names = TRUE,
    include.dirs = TRUE
  )
  if (getwd() == paste0("/home/rstudio/rcrawler/", target_site) &&
      file.exists(paste0(output_path, output_filename))) {
    unlink(dirnames, recursive = TRUE)
  }
}

# Merge every per-site CSV matching `keyword` under data/ into a single
# timestamped CSV and XLSX under all/.
combin_all <- function(keyword) {
  setwd("/home/rstudio/rcrawler/")
  filenames <- list.files(path = "data", pattern = keyword, full.names = TRUE)
  All <- lapply(filenames, function(i) {
    read.csv(
      i,
      header = TRUE,
      col.names = c("產品名稱", "產品網址", "產品來源", "最小金額", "最大金額")
    )
  })
  df <- do.call(rbind.data.frame, All)
  # Compute the timestamp once so the CSV and XLSX names always match
  # (the original called Sys.time() twice and could straddle a minute).
  stamp <- format(Sys.time(), "%m%d%H%M")
  write.csv(df, paste0("all/", stamp, "-", keyword, "-ALL.csv"),
            row.names = FALSE)
  write.xlsx(df, file = paste0("all/", stamp, "-", keyword, "-ALL.xlsx"))
}

# ---- Run ---------------------------------------------------------
# (Moved below the definitions: the original script invoked export_csv()
# and combin_all() before they were defined, which fails when sourced.)
target_site <- c("www.findprice.com.tw", "feebee.com.tw", "biggo.com.tw")
keyword <- "除溼機"  # search keyword
npage <- 250         # listing pages to fetch per site

for (site in target_site) {
  export_csv(site, keyword, npage)
}
combin_all(keyword)  # merge all per-site CSVs into one file