網路爬蟲
在新浪财经上抓取工商银行每天的股价:
抓取时间
# Scrape the trading dates of stock 601398 from the Sina Finance
# market-history page.
library(RCurl)

# Browser-like request headers sent with the page request.
# The Accept value was garbled in the original notes ("text/html.application"
# and "/;q=0.8"); it is restored to the standard browser form.
myheader <- c(
  "User-Agent" = "Mozilla/5.0(Windows;U;Windows NT 5.1; zh-CN; rv:1.9.1.6)",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection" = "keep-alive",
  "Accept-Charset" = "GB2312,utf-8;q=0.7,*;q=0.7"
)

# Download the page as one string; the headers defined above are now actually
# sent (the original built them but never passed them to getURL()).
temp <- getURL(
  "http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/601398.phtml",
  httpheader = myheader
)

# Split the page into lines (the page is split on CRLF line endings).
k <- strsplit(temp, "\r\n")[[1]]

# Each table row starts with an anchor tag; the line right after it holds the
# date. "\\s*" tolerates the stray space after "<" that appears in the notes,
# so both "<a target=" and "< a target=" are matched.
timeadr <- k[grep("<\\s*a target='_blank' href='", k) + 1]

# Characters 4..13 of that line are kept as the date string.
# NOTE(review): presumably a 10-character YYYY-MM-DD date preceded by three
# characters of indentation -- confirm against the live page.
time <- substring(timeadr, 4, 13)
抓取股价
# Extract one price per table row from the page lines in `k`
# (`k` is the downloaded page split into lines by the previous step).

# The price sits three lines below each row's anchor tag. "\\s*" tolerates the
# stray space after "<" that appears in the notes.
fpriceadr <- k[grep("<\\s*a target='_blank' href='", k) + 3]

# Locate ">digits" on every price line. The backslash must be doubled inside
# an R string; the original ">\d+" is a parse error.
fprice <- gregexpr(">\\d+", fpriceadr)

# Collect at most the first 50 rows, as the original did, but never index past
# the rows that actually exist.
n_rows <- min(50L, length(fpriceadr))
plist <- vector("list", n_rows)
for (i in seq_len(n_rows)) {
  # Use the match of row i (the original always reused row 1's position) and
  # only its first hit, so exactly one price is stored per row.
  hit_start <- fprice[[i]][1]
  hit_length <- attr(fprice[[i]], "match.length")[1]
  if (is.na(hit_start) || hit_start < 0) {
    # No ">digits" on this line: record a missing price instead of garbage.
    plist[[i]] <- NA_character_
  } else {
    # Skip the leading ">" and keep the integer digits plus the next four
    # characters.
    # NOTE(review): presumably a decimal point and three decimals, e.g.
    # "4.350" -- confirm against the live page.
    plist[[i]] <- substring(
      fpriceadr[i],
      hit_start + 1,
      hit_start + hit_length + 3
    )
  }
}
在大众点评上抓取理发店地址
# Packages used by the scraper that follows: RCurl for the download, XML for
# HTML parsing and XPath queries, plyr for laply().
library(RCurl)
library(XML)
library(plyr)

# Browser-like request headers sent with every page request.
# The Accept value was garbled in the original notes ("text/html.application"
# and "/;q=0.8"); it is restored to the standard browser form.
myheader <- c(
  "User-Agent" = "Mozilla/5.0(Windows;U;Windows NT 5.1; zh-CN; rv:1.9.1.6)",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection" = "keep-alive",
  "Accept-Charset" = "GB2312,utf-8;q=0.7,*;q=0.7"
)
# Collect the text of every <span class="addr"> element from the first three
# result pages into the character vector `adr`.
#
# NOTE(review): `urllist` (the page URLs) is not defined in the visible part of
# these notes; it must exist before this loop runs.

# Start from an empty result unless `adr` already exists, so the first
# c(adr, ...) below does not fail on an undefined object.
if (!exists("adr", inherits = FALSE)) {
  adr <- character(0)
}

# The original looped over 1:3; cap at the URLs that actually exist so a
# shorter `urllist` does not request an NA address.
for (i in seq_len(min(3L, length(urllist)))) {
  # `.encoding` tells RCurl how to interpret the returned text; a plain
  # `encoding` argument is forwarded to libcurl as the Accept-Encoding option.
  page_html <- getURL(urllist[i], httpheader = myheader, .encoding = "UTF-8")
  page_doc <- htmlParse(page_html)

  # Query the address nodes once (the original ran the same XPath twice and
  # discarded the first result).
  addr_nodes <- getNodeSet(page_doc, '//span[@class="addr"]')
  if (length(addr_nodes) == 0) {
    next
  }

  addr_text <- sapply(addr_nodes, xmlValue)
  # Break each address into its newline-separated pieces.
  addr_text <- laply(as.list(addr_text), function(x) {
    unlist(strsplit(x, "\n"))
  })
  adr <- c(adr, addr_text)
}