# ==================================================================
# Price-comparison crawler ("抓資料用")
#
# Scrapes product listings for a keyword from three Taiwanese
# price-comparison sites, writes one timestamped CSV per site under
# ../data/, then merges everything into a single CSV + XLSX under all/.
#
# Example listing URLs the three site configs below correspond to:
#   https://www.findprice.com.tw/g/<keyword>/?i=1
#   https://feebee.com.tw/s/?q=<keyword>
#   https://biggo.com.tw/s/<keyword>/?p=1
# ==================================================================

library(httr)
library(Rcrawler)
library(jsonlite)
library(methods)
library(dplyr)
library(anytime)
library(xml2)
library(rvest)
library(curl)
library(stringr)
library(openxlsx)  # provides write.xlsx() used in combin_all() (was called but never loaded)

# Crawl one comparison site for `keyword` across `npage` listing pages
# and write the scraped products to a timestamped CSV under ../data/.
#
# target_site  host name of the site; selects URL pattern, crawl filters
#              and XPath sets via the switch below
# keyword      search term (raw UTF-8; URL-encoded where required)
# npage        number of listing pages to request
#
# Side effects: changes the working directory (Rcrawler writes its
# project folders under cwd), performs network I/O, writes a CSV, and
# deletes the downloaded HTML via del_dir() on success.
export_csv <- function(target_site, keyword, npage) {
  # Per-site configuration. Two XPath sets per site:
  #   *.xpath1 — result pages showing a min~max price range
  #   *.xpath2 — single-item result pages with one price
  switch(
    target_site,
    www.findprice.com.tw = {
      location <- paste0("/home/rstudio/rcrawler/", target_site)
      setwd(location)
      geturl <- URLencode(paste0("https://", target_site, "/g/", keyword, "/?i="))
      ignoreUrlParams <- c("[Supplier]", "[merchantlist]")
      dataUrlfilter  <- c(paste0("[/g/", URLencode(keyword), "/i=]"))
      crawlUrlfilter <- c(paste0("[/g/", URLencode(keyword), "/i=]"))
      產品名稱.xpath1     <- "//a[@class='ga']/div"
      產品網址.xpath1     <- "//td[@valign='top']/a[@class='ga']/@href"
      產品價格範圍.xpath1 <- "//span[@class='rec-price']/text()"
      產品名稱.xpath2 <- "//a[@class='ga']/text()"
      產品網址.xpath2 <- "//td[@valign='middle']/a[@class='ga']/@href"
      產品來源.xpath2 <- "//span/font[@style='color:#009900']/text()"
      產品價格.xpath2 <- "//td[@class='tdPrice']/text()"
    },
    feebee.com.tw = {
      location <- paste0("/home/rstudio/rcrawler/", target_site)
      setwd(location)
      geturl <- URLencode(paste0("https://", target_site, "/s/", keyword,
                                 "/?sort=d&mode=l&ptab=0&page="))
      ignoreUrlParams <- c("[Supplier]", "[merchantlist]")
      dataUrlfilter  <- c(paste0("[/s/", URLencode(keyword), "/?mode=l&ptab=0&page=]"))
      crawlUrlfilter <- c(paste0("[/s/", URLencode(keyword), "/?mode=l&ptab=0&page=]"))
      產品名稱.xpath1     <- "//a[@class='product_link']/h3"
      產品網址.xpath1     <- "//a[@class='btn_comparison vertical_middle']/@href"
      產品價格範圍.xpath1 <- "//div[@class='product_group_price price ellipsis xlarge']"
      產品名稱.xpath2 <- "//a[@class='items_link']/h3/text()"
      產品網址.xpath2 <- "//li[@class='action']/a[@class='btn_buy items_link']/@href"
      產品來源.xpath2 <- "//div[@class='pure-u promote_info vertical_middle small']/span[@class='shop']/img/@alt"
      產品價格.xpath2 <- "//li[@class='price ellipsis xlarge']/text()"
    },
    biggo.com.tw = {
      location <- paste0("/home/rstudio/rcrawler/", target_site)
      setwd(location)
      geturl <- URLencode(paste0("https://", target_site, "/s/", keyword, "/?p="))
      ignoreUrlParams <- c("[Supplier]", "[merchantlist]")
      dataUrlfilter  <- c(paste0("[/s/", URLencode(keyword), "/?p=]"))
      crawlUrlfilter <- c(paste0("[/s/", URLencode(keyword), "/?p=]"))
      產品名稱.xpath1     <- "//a[@data-id='biggo_product']/div[@class='titlemax']"
      產品網址.xpath1     <- "//a[@data-id='biggo_product']/@href"
      產品價格範圍.xpath1 <- "//div[contains(@id,'itemlist_') and not(contains(@id,'itemlist_-1'))]/div/div/div/div/span/strong/text()"
      產品名稱.xpath2 <- "//div[contains(@id,'itemlist_') and not(contains(@id,'itemlist_-1'))]/*//h2/a/@data-title"
      產品網址.xpath2 <- "//div[contains(@id,'itemlist_') and not(contains(@id,'itemlist_-1'))]/*//h2/a/@data-href"
      產品來源.xpath2 <- "//div[contains(@id,'itemlist_') and not(contains(@id,'itemlist_-1'))]/*//h2/a/@data-id"
      產品價格.xpath2 <- "//div[contains(@id,'itemlist_') and not(contains(@id,'itemlist_-1'))]/*//h2/a/@data-price"
    },
    {
      # Unknown site: no config is set, so the crawl below will fail fast.
      print("default")
    }
  )

  # One listing URL per page number: <geturl>1, <geturl>2, ...
  urllist <- sapply(seq_len(npage), function(x) paste0(geturl, x))

  Rcrawler(
    Website = urllist,
    no_cores = 1,
    no_conn = npage,
    MaxDepth = 1,
    RequestsDelay = 5,  # be polite: 5 s between requests
    Obeyrobots = FALSE,
    # DIR = "www.findprice.com.tw/",
    Useragent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    Encod = "utf-8",
    ignoreUrlParams = ignoreUrlParams,
    dataUrlfilter = dataUrlfilter,
    crawlUrlfilter = crawlUrlfilter,
    Vbrowser = FALSE
  )

  getProjectList <- ListProjects()
  l <- length(getProjectList)
  產品清單 <- data.frame()

  # NOTE(review): `l:l` deliberately visits only the LAST (most recent)
  # Rcrawler project — i.e. the crawl that just finished above.
  for (l in l:l) {
    DataHTML <- sapply(getProjectList[l], function(x) {
      LoadHTMLFiles(x, type = "list")
    })

    # Pass 1: pages that show a "min ~ max" price range (xpath1 set).
    for (x in seq_along(DataHTML)) {
      產品名稱 <- ContentScraper(
        HTmlText = DataHTML[[x]],
        XpathPatterns = 產品名稱.xpath1,
        ManyPerPattern = TRUE
      )
      產品網址 <- ContentScraper(
        HTmlText = DataHTML[[x]],
        XpathPatterns = 產品網址.xpath1,
        ManyPerPattern = TRUE
      )
      產品價格範圍 <- ContentScraper(
        HTmlText = DataHTML[[x]],
        XpathPatterns = 產品價格範圍.xpath1,
        ManyPerPattern = TRUE
      )

      # Normalise "$1,234 ~ $5,678" into separate min / max values.
      price <- gsub("[$, ]", "", unlist(產品價格範圍))
      price_tmp <- strsplit(as.character(price), "[~|~]", fixed = FALSE)
      a <- as.data.frame(price_tmp, row.names = NULL)
      if (length(unlist(a)) > 1) {
        price_min <- data.frame(t(a), row.names = NULL)[1]
        price_max <- data.frame(t(a), row.names = NULL)[2]
      }
      if (length(unlist(a)) == 1) {
        # Single price: min and max are the same value.
        price_min <- data.frame(t(a), row.names = NULL)[1]
        price_max <- data.frame(t(a), row.names = NULL)[1]
      }
      if (length(unlist(a)) == 0) {
        price_min <- NULL
        price_max <- NULL
      }
      if (length(a) != 0) {
        tmp <- cbind(
          產品名稱 = unlist(產品名稱),
          產品網址 = gsub("[\n ]", "", unlist(產品網址)),
          產品來源 = unlist(rep(list("0"), length(a))),  # no source on range pages; pad with "0"
          price_min = (price_min),
          price_max = (price_max)
        )
        colnames(tmp) <- c("產品名稱", "產品網址", "產品來源", "最小金額", "最大金額")
        產品清單 <- rbind(產品清單, tmp)
      }
    }

    # Pass 2: single-item listing pages (xpath2 set).
    for (x in seq_along(DataHTML)) {
      產品名稱 <- ContentScraper(
        HTmlText = DataHTML[[x]],
        XpathPatterns = 產品名稱.xpath2,
        ManyPerPattern = TRUE
      )
      產品網址 <- ContentScraper(
        HTmlText = DataHTML[[x]],
        XpathPatterns = 產品網址.xpath2,
        ManyPerPattern = TRUE
      )
      產品來源 <- ContentScraper(
        HTmlText = DataHTML[[x]],
        XpathPatterns = 產品來源.xpath2,
        ManyPerPattern = TRUE
      )
      產品價格 <- ContentScraper(
        HTmlText = DataHTML[[x]],
        XpathPatterns = 產品價格.xpath2,
        ManyPerPattern = TRUE
      )

      # Strip currency symbols, thousands separators and letters.
      price <- gsub("[A-Za-z$, ]", "", unlist(產品價格))
      if (length(price) != 0) {
        tmp <- cbind(
          產品名稱 = unlist(產品名稱),
          產品網址 = gsub("[\n ]", "", unlist(產品網址)),
          產品來源 = gsub("[\n ]", "", unlist(產品來源)),
          # NOTE(review): the original hard-codes the minimum to 0 for
          # single-price items (only the maximum carries the real price)
          # — preserved as-is; confirm whether min should equal price.
          price = 0,
          price = (price)
        )
        colnames(tmp) <- c("產品名稱", "產品網址", "產品來源", "最小金額", "最大金額")
        產品清單 <- rbind(產品清單, tmp)
      }
    }
  }
  rm(DataHTML)

  產品清單 <- unique(產品清單)  # drop duplicate rows
  整理 <- 產品清單[with(產品清單, order(產品名稱)), ]  # sort by product name

  output_path <- "../data/"
  output_filename <- paste0(keyword, "-", target_site, "-",
                            format(Sys.time(), "%m%d%H%M"), ".csv")
  # row.names = FALSE keeps the file at exactly the 5 columns that
  # combin_all()'s read.csv(col.names = ...) expects (the original wrote
  # a 6th row-name column, which broke the merge).
  write.csv(整理, file = paste0(output_path, output_filename), row.names = FALSE)

  # Clean up the downloaded HTML only after the CSV is safely written.
  del_dir(target_site, output_path, output_filename)
}

# Delete the downloaded Rcrawler project folders for `target_site`.
# Guards: only deletes when the working directory is the expected
# per-site crawl directory AND the exported CSV actually exists, so the
# raw HTML is never removed before a successful export.
del_dir <- function(target_site, output_path, output_filename) {
  dirnames <- dir(
    path = ".",
    pattern = target_site,
    full.names = TRUE,
    include.dirs = TRUE
  )
  if (getwd() == paste0("/home/rstudio/rcrawler/", target_site) &&
      file.exists(paste0(output_path, output_filename))) {
    unlink(dirnames, recursive = TRUE)
  }
}

# Merge every per-site CSV matching `keyword` under data/ into a single
# timestamped CSV and XLSX under all/.
combin_all <- function(keyword) {
  setwd("/home/rstudio/rcrawler/")
  filenames <- list.files(path = "data", pattern = keyword, full.names = TRUE)
  All <- lapply(filenames, function(i) {
    read.csv(
      i,
      header = TRUE,
      col.names = c("產品名稱", "產品網址", "產品來源", "最小金額", "最大金額")
    )
  })
  df <- do.call(rbind.data.frame, All)
  # Compute the timestamp once so the CSV and XLSX names always match
  # (the original called Sys.time() twice and could straddle a minute).
  stamp <- format(Sys.time(), "%m%d%H%M")
  write.csv(df, paste0("all/", stamp, "-", keyword, "-ALL.csv"),
            row.names = FALSE)
  write.xlsx(df, file = paste0("all/", stamp, "-", keyword, "-ALL.xlsx"))
}

# ---- Run ---------------------------------------------------------
# (Moved below the definitions: the original script invoked export_csv()
# and combin_all() before they were defined, which fails when sourced.)
target_site <- c("www.findprice.com.tw", "feebee.com.tw", "biggo.com.tw")
keyword <- "除溼機"  # search keyword
npage <- 250         # listing pages to fetch per site

for (site in target_site) {
  export_csv(site, keyword, npage)
}
combin_all(keyword)  # merge all per-site CSVs into one file