연구실에서 사용하는 크롤링 코드와

 https://gist.github.com/i88ca/2ebd274587b84f996726 의 트위터 로그인 방법을 참고했습니다.


자바 1.8, HtmlUnit 2.1을 썼습니다(이전에 HtmlUnit 2.7 버전이라고 잘못 썼음).


블로그에 소스코드 문법 강조 기능을 넣으니 화면에서 잘려 보이는 소스코드가 있네요.


전문은 깃허브의 소스를 참고 바랍니다.


소스 링크:

 https://github.com/WoongheeLee/HtmlUnitCrawler/blob/master/huCrawler.java




package crawler;

import java.io.IOException;

/**
 * Small HtmlUnit-based crawler: logs in through the mobile Twitter login form
 * and prints one line per link found on a fixed account's "following" page.
 *
 * <p>Usage: fill in the account placeholders below and run {@link #main}.
 */
public class huCrawler {
	// ACCOUNT INFORMATION -- replace the placeholders before running.
	private static final String ACCOUNT_EMAIL = "EMAIL";
	private static final String ACCOUNT_PASSWORD = "PASSWORD";

	// Number of characters cut from the front and back of an anchor's
	// toString() text before printing.
	// NOTE(review): presumably these match the `HtmlAnchor[<a href="/` prefix
	// and a `?p=s">]`-style suffix -- confirm against real output.
	private static final int ANCHOR_PREFIX_LENGTH = 21;
	private static final int ANCHOR_SUFFIX_LENGTH = 7;

	private WebClient webClient = null;

	/**
	 * Creates the web client and configures it: script errors and failing
	 * HTTP status codes do not throw, JavaScript, redirects and cookies are
	 * enabled, CSS is disabled.
	 */
	private void init() {
		webClient = new WebClient(BrowserVersion.INTERNET_EXPLORER);
		webClient.setAjaxController(new NicelyResynchronizingAjaxController());
		webClient.getOptions().setThrowExceptionOnScriptError(false);
		webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
		webClient.getOptions().setJavaScriptEnabled(true);
		webClient.getOptions().setRedirectEnabled(true);
		webClient.getCookieManager().setCookiesEnabled(true);
		webClient.getOptions().setCssEnabled(false);

		// NOTE(review): no page has been loaded yet at this point, so this
		// wait likely has nothing to wait for -- consider calling it after a
		// page load instead. Kept here to preserve the original sequence.
		webClient.waitForBackgroundJavaScript(1000);
	}

	/**
	 * Loads the mobile login page, fills the first form on it with the
	 * configured account credentials and clicks the "commit" input.
	 *
	 * @throws IllegalStateException if the login page contains no form
	 * @throws IOException if loading the page or submitting the form fails
	 */
	private void loginTwitter() throws FailingHttpStatusCodeException, MalformedURLException, IOException {
		final HtmlPage page = webClient.getPage("https://mobile.twitter.com/session/new");
		if (page.getForms().isEmpty()) {
			// Fail with a descriptive message instead of a bare IndexOutOfBoundsException.
			throw new IllegalStateException("No login form found on https://mobile.twitter.com/session/new");
		}
		final HtmlForm form = page.getForms().get(0);
		form.reset();

		HtmlTextInput username = (HtmlTextInput) form.getInputByName("session[username_or_email]");
		username.setValueAttribute(ACCOUNT_EMAIL);

		HtmlPasswordInput password = (HtmlPasswordInput) form.getInputByName("session[password]");
		password.setValueAttribute(ACCOUNT_PASSWORD);

		// The resulting page is not needed; the click is done for its side
		// effect on the client's session (cookies are enabled in init()).
		HtmlInput button = form.getInputByName("commit");
		button.click();
	}

	/**
	 * Loads the "following" page of the hard-coded account and prints one
	 * line per anchor matched by the user-list XPath.
	 *
	 * @throws IOException if loading the page fails
	 */
	private void getFollowing() throws FailingHttpStatusCodeException, MalformedURLException, IOException {
		final HtmlPage page = webClient.getPage("https://mobile.twitter.com/WoongheeLee2/following");

		List<?> following = page.getByXPath("//div[@class='user-list']//table[@class='user-item']//td[@class='info']//a");
		for (Object anchor : following) {
			System.out.println(trimAnchorText(anchor.toString()));
		}
	}

	/**
	 * Cuts the fixed-length prefix and suffix from an anchor's string form.
	 *
	 * @param text the anchor's {@code toString()} output
	 * @return the text between the prefix and suffix, or {@code text}
	 *         unchanged when it is too short to contain both
	 */
	private static String trimAnchorText(String text) {
		final int end = text.length() - ANCHOR_SUFFIX_LENGTH;
		if (end < ANCHOR_PREFIX_LENGTH) {
			// Too short to trim; print it as-is rather than throwing
			// StringIndexOutOfBoundsException and aborting the whole listing.
			return text;
		}
		return text.substring(ANCHOR_PREFIX_LENGTH, end);
	}

	/** Closes the web client if it was created. */
	private void closePage() {
		if (webClient != null) {
			webClient.close();
		}
	}

	/**
	 * Entry point: initialises the client, logs in, prints the following
	 * list, and always closes the client afterwards.
	 */
	public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException  {
		huCrawler crawler = new huCrawler();
		try {
			crawler.init();
			crawler.loginTwitter();
			crawler.getFollowing();
		} finally {
			// Release the client even when login or crawling throws.
			crawler.closePage();
		}
	}
}


Posted by 공돌이pooh
,