網路爬蟲

在新浪财经上抓取工商银行每天的股价:

抓取时间

library(RCurl)
# Browser-like HTTP request headers, intended to be handed to
# RCurl::getURL() through its `httpheader` option.
myheader <- c(
  "User-Agent" = "Mozilla/5.0(Windows;U;Windows NT 5.1; zh-CN; rv:1.9.1.6)",
  # Media types in Accept are comma-separated and the catch-all entry is
  # "*/*" (not a bare "/").
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection" = "keep-alive",
  "Accept-Charset" = "GB2312,utf-8;q=0.7,*;q=0.7"
)
# Download the historical-quotes page for stock 601398, sending the
# request headers defined in `myheader`.
temp <- getURL(
  "http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/601398.phtml",
  httpheader = myheader
)
# Split the page into lines; accept both CRLF and bare LF line endings.
k <- strsplit(temp, "\r?\n")[[1]]
# The line directly after each matching link anchor is taken as the date
# line. `\\s*` tolerates optional whitespace between "<" and "a".
timeadr <- k[grep("<\\s*a target='_blank' href='", k) + 1]
# Keep characters 4-13 of each of those lines as the date text
# (presumably a 3-character indent followed by YYYY-MM-DD -- TODO confirm
# against the live page).
time <- substring(timeadr, 4, 13)

抓取股价

# The price cell is read from the line three below each matching link
# anchor in `k` (the page split into lines). `\\s*` tolerates optional
# whitespace between "<" and "a".
fpriceadr <- k[grep("<\\s*a target='_blank' href='", k) + 3]
# Locate ">" followed by digits in every price line. The backslash has to
# be doubled: "\d" is not a valid escape inside an R string literal.
fprice <- gregexpr(">\\d+", fpriceadr)
# One list slot per price line instead of a hard-coded 50.
plist <- vector("list", length(fpriceadr))
for (i in seq_along(fpriceadr)) {
  # Use the match for *this* line, not the match of the first line.
  tempp <- fprice[[i]]
  start <- tempp[1]
  if (is.na(start) || start == -1) {
    # No ">digits" on this line: record a missing value rather than garbage.
    plist[[i]] <- NA_character_
    next
  }
  len <- attr(tempp, "match.length")[1]
  # Skip the ">" and keep the integer digits plus four further characters
  # (presumably the decimal point and three decimals -- TODO confirm).
  fprices <- substring(fpriceadr[i], start + 1, start + len + 3)
  plist[[i]] <- fprices
}

在大众点评上抓取理发店地址

library(RCurl)
library(XML)
library(plyr)
# Browser-like HTTP request headers, handed to RCurl::getURL() through
# its `httpheader` option.
myheader <- c(
  "User-Agent" = "Mozilla/5.0(Windows;U;Windows NT 5.1; zh-CN; rv:1.9.1.6)",
  # Media types in Accept are comma-separated and the catch-all entry is
  # "*/*" (not a bare "/").
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection" = "keep-alive",
  "Accept-Charset" = "GB2312,utf-8;q=0.7,*;q=0.7"
)

# Collect the text of every <span class="addr"> from the first three pages
# listed in `urllist` and append the pieces to `adr`.
#
# `urllist` is not defined in this snippet; it must already hold the page
# URLs (a character vector) before this code runs.
if (!exists("urllist")) {
  stop("`urllist` must be defined before running the scraper", call. = FALSE)
}
# Create the accumulator on first use; an existing `adr` keeps growing.
if (!exists("adr", inherits = FALSE)) {
  adr <- character(0)
}
# At most three pages, and never more than `urllist` actually contains.
for (i in seq_len(min(3L, length(urllist)))) {
  # `.encoding` tells RCurl how to decode the response body; a plain
  # `encoding` argument would be forwarded to libcurl as a request option.
  temp <- getURL(urllist[i], httpheader = myheader, .encoding = "UTF-8")
  k <- htmlParse(temp, asText = TRUE)
  # Text content of each address node; nodes yielding no text are dropped.
  t <- as.character(unlist(lapply(getNodeSet(k, '//span[@class="addr"]'), xmlValue)))
  # Break multi-line values into separate entries. Base strsplit()/unlist()
  # copes with values that split into different numbers of pieces.
  t <- unlist(strsplit(t, "\n", fixed = TRUE))
  adr <- c(adr, t)
}

results matching ""

    No results matching ""