search
尋找貓咪~QQ 地點 桃園市桃園區 Taoyuan , Taoyuan

當一個顏值很高的程序員是怎樣一番體驗?

同為 Java 碼農,試著把那個拿了 8k 贊的美女程序員 @小仙女Albee 的代碼跑了一下,發現缺了兩個類……看了一圈評論都沒人發現這個問題(果然重點不是程序員,而是顏值)。

做了些小改動,寫了個能跑的代碼.

package webMagic;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.math.RandomUtils;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.JsonPathSelector;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;

/**
 * Created by Albee on 2017/4/13.
 * Optimized by Stephen Cai on 2017/9/3
 *
 * <p>WebMagic page processor that starts from a Zhihu question page, queues the
 * question's answers through the answers API, and downloads every image found
 * in each answer page into {@link #SAVE_DIR}.
 *
 * <p>The processor handles three kinds of pages:
 * <ol>
 *   <li>the question page itself: queues the paged answers-API requests;</li>
 *   <li>an answer detail page: downloads the images it contains;</li>
 *   <li>anything else (expected to be the answers-API JSON): queues one answer
 *       detail page per answer id.</li>
 * </ol>
 */
@Slf4j
public class GetquestionUrlProcessor implements PageProcessor {

    // TODO: replace with the value of your own Zhihu "z_c0" login cookie.
    private static final String Z_C0_COOKIE = "<your-zhihu-z_c0-cookie>";

    // TODO: replace with the directory the downloaded images should be saved into.
    private static final String SAVE_DIR = "<your-download-directory>";

    // Question index page, e.g. https://www.zhihu.com/question/20902967
    private static final String URL_question = "^https://www\\.zhihu\\.com/question/\\d+$";

    // Answer detail page, e.g. https://www.zhihu.com/question/19647535/answer/110944270
    private static final String URL_answer = "https://www\\.zhihu\\.com/question/\\d+/answer/\\d+";

    // Upper bound on the number of answers requested from the API.
    private static final int MAX_ANSWERS = 20;

    // Number of answers requested per API call.
    private static final int PAGE_SIZE = 20;

    // Connect and read timeout for image downloads, in milliseconds.
    private static final int DOWNLOAD_TIMEOUT_MILLIS = 5 * 1000;

    // Value of the "include" query parameter sent to the answers API.
    private static final String ANSWERS_INCLUDE =
            "data%5B*%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count"
            + "%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings"
            + "%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time"
            + "%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp"
            + "%2Cupvoted_followees%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics";

    // Id of the question being crawled; assigned in main before the spider starts.
    private static String questionId = "";

    // Charset, retry count, sleep time between requests, timeout, cookies and user agent.
    private Site site = Site.me()
            .setCharset("UTF-8")
            .setRetryTimes(10)
            .setSleepTime(5000)
            .setTimeOut(5000)
            .addCookie("Domain", "zhihu.com")
            .addCookie("z_c0", Z_C0_COOKIE)
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36");

    /**
     * Dispatches on the page URL: question page, answer detail page, or API response.
     *
     * @param page the page WebMagic has just downloaded
     */
    @Override
    public void process(us.codecraft.webmagic.Page page) {
        if (page.getUrl().regex(URL_question).match()) {
            queueAnswerApiRequests(page);
        } else if (page.getUrl().regex(URL_answer).match()) {
            downloadAnswerImages(page);
        } else {
            queueAnswerPages(page);
        }
    }

    /** Queues one answers-API request per page of up to PAGE_SIZE answers. */
    private void queueAnswerApiRequests(us.codecraft.webmagic.Page page) {
        page.setCharset("UTF-8");
        page.getRequest().setCharset("UTF-8");
        // "limit" is sent as a page size (a count), so consecutive pages do not skip
        // the answer at a page boundary.
        // NOTE(review): assumes the API interprets limit as a count - confirm.
        for (int offset = 0; offset < MAX_ANSWERS; offset += PAGE_SIZE) {
            int limit = Math.min(PAGE_SIZE, MAX_ANSWERS - offset);
            String url = "https://www.zhihu.com/api/v4/questions/" + questionId
                    + "/answers?include=" + ANSWERS_INCLUDE
                    + "&offset=" + offset
                    + "&limit=" + limit
                    + "&sort_by=default";
            page.addTargetRequest(url);
        }
    }

    /** Downloads every image referenced inside the answer body of an answer detail page. */
    private void downloadAnswerImages(us.codecraft.webmagic.Page page) {
        List<String> urlList =
                page.getHtml().xpath("//div[@class=RichContent-inner]//img/@src").all();
        for (String url : urlList) {
            // Skip src values that are not fetchable over HTTP(S), such as inline data: URIs.
            if (url == null || !url.startsWith("http")) {
                continue;
            }
            // A random number is used as the base name; the extension comes from the URL.
            String filename = Integer.toString(RandomUtils.nextInt()) + extensionOf(url);
            try {
                downloadPicture(url, SAVE_DIR, filename);
            } catch (Exception e) {
                // Best effort: one failed image must not abort the rest of the answer.
                log.error("Failed to download image {}", url, e);
            }
        }
    }

    /** Queues the answer detail page for every answer id found in the API JSON response. */
    private void queueAnswerPages(us.codecraft.webmagic.Page page) {
        List<String> ids = new JsonPathSelector("$.data[*].id").selectList(page.getRawText());
        for (String id : ids) {
            page.addTargetRequest("https://www.zhihu.com/question/" + questionId + "/answer/" + id);
        }
    }

    /**
     * Returns the file extension (including the leading dot) of the last path segment
     * of the given URL, ignoring any query string or fragment, or an empty string when
     * that segment has no dot.
     */
    private static String extensionOf(String url) {
        String path = url;
        int queryStart = path.indexOf('?');
        if (queryStart >= 0) {
            path = path.substring(0, queryStart);
        }
        int fragmentStart = path.indexOf('#');
        if (fragmentStart >= 0) {
            path = path.substring(0, fragmentStart);
        }
        String lastSegment = path.substring(path.lastIndexOf('/') + 1);
        int dot = lastSegment.lastIndexOf('.');
        return dot >= 0 ? lastSegment.substring(dot) : "";
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Downloads the resource at {@code urlString} into {@code savePath/filename},
     * creating {@code savePath} when it does not exist yet.
     *
     * @param urlString address of the resource to download
     * @param savePath  directory to store the file in
     * @param filename  name of the file to create inside {@code savePath}
     * @throws Exception if the URL is malformed, the directory cannot be created,
     *                   or reading from the network / writing the file fails
     */
    public static void downloadPicture(String urlString, String savePath, String filename) throws Exception {
        URL url = new URL(urlString);
        URLConnection con = url.openConnection();
        con.setConnectTimeout(DOWNLOAD_TIMEOUT_MILLIS);
        // Without a read timeout a stalled server would block a spider thread indefinitely.
        con.setReadTimeout(DOWNLOAD_TIMEOUT_MILLIS);

        File target = new File(savePath + "/" + filename);
        File parent = target.getParentFile();
        // The second isDirectory() check tolerates another thread creating the directory first.
        if (parent != null && !parent.isDirectory() && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Could not create directory " + parent);
        }

        // try-with-resources closes both streams even when the copy fails part-way.
        try (InputStream is = con.getInputStream();
             OutputStream os = new FileOutputStream(target)) {
            byte[] bs = new byte[1024];
            int len;
            while ((len = is.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
        }
    }

    /**
     * Crawls the images of one hard-coded Zhihu question with ten spider threads.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        questionId = "37787176";
        Spider.create(new GetquestionUrlProcessor())
                .addUrl("https://www.zhihu.com/question/" + questionId)
                // .addPipeline(new GetquestionUrlPipeline())
                .thread(10)
                .run();
    }
}

20170903 15:49

爬某個用戶所有回答中的圖片的代碼也寫好了,順便爬了下小仙女的圖片,大家可以試試,代碼里還有不少瑕疵,暫不做優化了.

package webMagic;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.math.RandomUtils;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.JsonPathSelector;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;

/**
 * Created by Albee on 2017/4/13.
 * Optimized by Stephen Cai on 2017/9/3
 *
 * <p>WebMagic page processor that starts from a Zhihu user's "answers" page, queues
 * that user's answers through the member-answers API, and downloads every image
 * found in each answer page into a per-user folder below {@link #SAVE_DIR}.
 *
 * <p>The processor handles three kinds of pages:
 * <ol>
 *   <li>the user's answers page: queues the paged member-answers API requests;</li>
 *   <li>an answer detail page: downloads the images it contains;</li>
 *   <li>anything else (expected to be the API JSON): queues one answer detail page
 *       per (question id, answer id) pair.</li>
 * </ol>
 */
@Slf4j
public class ZhihuHomePageProcessor implements PageProcessor {

    // TODO: replace with the value of your own Zhihu "z_c0" login cookie.
    private static final String Z_C0_COOKIE = "<your-zhihu-z_c0-cookie>";

    // TODO: replace with the base directory the downloaded images should be saved into.
    private static final String SAVE_DIR = "<your-download-directory>";

    // Answer detail page, e.g. https://www.zhihu.com/question/19647535/answer/110944270
    private static final String URL_answer = "https://www\\.zhihu\\.com/question/\\d+/answer/\\d+";

    // A user's answers listing, e.g. https://www.zhihu.com/people/<name>/answers
    private static final String URL_ANSWERS = "https://www\\.zhihu\\.com/people/.*/answers";

    // Upper bound on the number of answers requested from the API.
    private static final int MAX_ANSWERS = 20;

    // Number of answers requested per API call.
    private static final int PAGE_SIZE = 20;

    // Connect and read timeout for image downloads, in milliseconds.
    private static final int DOWNLOAD_TIMEOUT_MILLIS = 5 * 1000;

    // Value of the "include" query parameter sent to the member-answers API.
    private static final String ANSWERS_INCLUDE =
            "data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed"
            + "%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by"
            + "%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Cvoteup_count"
            + "%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time"
            + "%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized"
            + "%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees"
            + "%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics";

    // URL token of the user being crawled; assigned in main before the spider starts.
    private static String userName = "";

    // Retry count, sleep time between requests, timeout, cookies and user agent.
    private Site site = Site.me()
            .setRetryTimes(10)
            .setSleepTime(5000)
            .setTimeOut(5000)
            .addCookie("Domain", "zhihu.com")
            .addCookie("z_c0", Z_C0_COOKIE)
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36");

    /**
     * Dispatches on the page URL: user answers page, answer detail page, or API response.
     *
     * @param page the page WebMagic has just downloaded
     */
    @Override
    public void process(us.codecraft.webmagic.Page page) {
        if (page.getUrl().regex(URL_ANSWERS).match()) {
            queueAnswerApiRequests(page);
        } else if (page.getUrl().regex(URL_answer).match()) {
            downloadAnswerImages(page);
        } else {
            queueAnswerPages(page);
        }
    }

    /** Queues one member-answers API request per page of up to PAGE_SIZE answers. */
    private void queueAnswerApiRequests(us.codecraft.webmagic.Page page) {
        page.setCharset("UTF-8");
        // "limit" is sent as a page size (a count), so consecutive pages do not skip
        // the answer at a page boundary.
        // NOTE(review): assumes the API interprets limit as a count - confirm.
        for (int offset = 0; offset < MAX_ANSWERS; offset += PAGE_SIZE) {
            int limit = Math.min(PAGE_SIZE, MAX_ANSWERS - offset);
            String url = "https://www.zhihu.com/api/v4/members/" + userName
                    + "/answers?include=" + ANSWERS_INCLUDE
                    + "&offset=" + offset
                    + "&limit=" + limit
                    + "&sort_by=created";
            page.addTargetRequest(url);
        }
    }

    /** Downloads every image referenced inside the answer body of an answer detail page. */
    private void downloadAnswerImages(us.codecraft.webmagic.Page page) {
        List<String> urlList =
                page.getHtml().xpath("//div[@class=RichContent-inner]//img/@src").all();
        // Images are grouped in a folder named after the crawled user.
        String filePath = new File(SAVE_DIR, userName).getPath();
        for (String url : urlList) {
            // Skip src values that are not fetchable over HTTP(S), such as inline data: URIs.
            if (url == null || !url.startsWith("http")) {
                continue;
            }
            // A random number is used as the base name; the extension comes from the URL.
            String filename = Integer.toString(RandomUtils.nextInt()) + extensionOf(url);
            try {
                downloadPicture(url, filePath, filename);
            } catch (Exception e) {
                // Best effort: one failed image must not abort the rest of the answer.
                log.error("Failed to download image {}", url, e);
            }
        }
    }

    /** Queues the answer detail page for every (question id, answer id) pair in the API JSON. */
    private void queueAnswerPages(us.codecraft.webmagic.Page page) {
        String json = page.getRawText();
        List<String> answerIds = new JsonPathSelector("$.data[*].id").selectList(json);
        List<String> questionIds = new JsonPathSelector("$.data[*].question.id").selectList(json);
        // The two lists are paired by index; stop at the shorter one so an entry without
        // a question id cannot cause an IndexOutOfBoundsException.
        int pairs = Math.min(answerIds.size(), questionIds.size());
        for (int i = 0; i < pairs; i++) {
            page.addTargetRequest("https://www.zhihu.com/question/" + questionIds.get(i)
                    + "/answer/" + answerIds.get(i));
        }
    }

    /**
     * Returns the file extension (including the leading dot) of the last path segment
     * of the given URL, ignoring any query string or fragment, or an empty string when
     * that segment has no dot.
     */
    private static String extensionOf(String url) {
        String path = url;
        int queryStart = path.indexOf('?');
        if (queryStart >= 0) {
            path = path.substring(0, queryStart);
        }
        int fragmentStart = path.indexOf('#');
        if (fragmentStart >= 0) {
            path = path.substring(0, fragmentStart);
        }
        String lastSegment = path.substring(path.lastIndexOf('/') + 1);
        int dot = lastSegment.lastIndexOf('.');
        return dot >= 0 ? lastSegment.substring(dot) : "";
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Downloads the resource at {@code urlString} into {@code savePath/filename},
     * creating {@code savePath} when it does not exist yet.
     *
     * @param urlString address of the resource to download
     * @param savePath  directory to store the file in
     * @param filename  name of the file to create inside {@code savePath}
     * @throws Exception if the URL is malformed, the directory cannot be created,
     *                   or reading from the network / writing the file fails
     */
    public static void downloadPicture(String urlString, String savePath, String filename) throws Exception {
        URL url = new URL(urlString);
        URLConnection con = url.openConnection();
        con.setConnectTimeout(DOWNLOAD_TIMEOUT_MILLIS);
        // Without a read timeout a stalled server would block a spider thread indefinitely.
        con.setReadTimeout(DOWNLOAD_TIMEOUT_MILLIS);

        File file = new File(savePath + "/" + filename);
        File parent = file.getParentFile();
        // The second isDirectory() check tolerates another thread creating the directory first.
        if (parent != null && !parent.isDirectory() && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Could not create directory " + parent);
        }

        // try-with-resources closes both streams even when the copy fails part-way.
        try (InputStream is = con.getInputStream();
             OutputStream os = new FileOutputStream(file)) {
            byte[] bs = new byte[1024];
            int len;
            while ((len = is.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
        }
    }

    /**
     * Crawls the answer images of one hard-coded Zhihu user with ten spider threads.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        userName = "xiao-xian-nu-albee";
        // Address of the user's answers page.
        String url = "https://www.zhihu.com/people/" + userName + "/answers";
        Spider.create(new ZhihuHomePageProcessor())
                .addUrl(url)
                // .addPipeline(new GetquestionUrlPipeline())
                .thread(10)
                .run();
    }
}

最後感謝下源碼作者 @黃億華
貼上 WebMagic 的 GitHub 地址:https://github.com/code4craft/webmagic



熱門推薦

本文由 yidianzixun 提供 原文連結

寵物協尋 相信 終究能找到回家的路
寫了7763篇文章,獲得2次喜歡
留言回覆
回覆
精彩推薦