前言
WebMagic实现网页爬取,当然也可以使用HttpClient和jsoup实现
WebMagic四大组件
- Downloader 下载页面
- PageProcessor 解析页面
- Scheduler 管理url(去重)
- Pipeline 结果处理(持久化)
去重
HashSet:优点是Java自带、使用方便;缺点是占用内存多,数据量大时性能低
Redis:优点是速度快、可以处理大量数据;缺点是需要额外部署Redis服务器,成本较高
布隆过滤器(BloomFilter):优点是占用内存比HashSet小、可以处理大量数据;缺点是存在小概率误判(可能把尚未抓取过的url判定为重复,从而漏抓)
小案例
pom(相关jar)
<!-- WebMagic core package -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
<exclusions>
<!-- Drops the slf4j-log4j12 binding that comes with webmagic-core; presumably to avoid
     clashing with the host application's own SLF4J binding (TODO confirm) -->
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- WebMagic extension package -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<!-- Guava, required for WebMagic's Bloom filter support -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>16.0</version>
</dependency>
java代码
package com.uibobo.task;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.List;
/**
 * Scheduled WebMagic crawler: loads a tianyancha.com search page, follows the first
 * search hit to the company detail page and prints a few registration fields from it.
 *
 * <p>The same {@link PageProcessor} handles both page types: a page that contains
 * search-result headers is treated as the search page, anything else as a detail page.
 *
 * @Author: bo bo
 * @Date: 2020/8/31 22:49
 */
@Component
public class JobTask implements PageProcessor {

    /** CSS selector matching the header of every entry on the search-result page. */
    private static final String RESULT_HEADER_CSS = "div.result-list div.content div.header";

    /** CSS selector matching the rows of the registration table on the detail page. */
    private static final String DETAIL_ROW_CSS = "table.-border-top-none tr";

    /** Number of table rows {@link #saveInfo(Page)} reads. */
    private static final int REQUIRED_DETAIL_ROWS = 3;

    /** Search URL used as the crawl entry point (the query string is a URL-encoded company name). */
    private final String url = "https://www.tianyancha.com/search?key=%E9%98%BF%E9%87%8C%E5%B7%B4%E5%B7%B4%E7%BD%91%E7%BB%9C%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8";

    /** Crawl settings shared by every request issued by this processor. */
    private final Site site = Site.me()
            .setCharset("UTF-8")        // page encoding
            .setTimeOut(10 * 1000)      // timeout
            .setRetrySleepTime(3000)    // pause between retries
            .setRetryTimes(3)           // number of retries
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36");

    /**
     * Handles one downloaded page.
     *
     * <p>If the page contains search-result headers, the first hit's name and link are
     * printed and the link is queued for crawling. Otherwise the page is handed to
     * {@link #saveInfo(Page)} as a detail page. To process every hit instead of only the
     * first one, iterate over the header list.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        List<Selectable> headers = page.getHtml().css(RESULT_HEADER_CSS).nodes();
        if (headers.isEmpty()) {
            saveInfo(page);
            return;
        }

        // Only the first search hit is followed.
        Selectable firstHit = headers.get(0);
        String link = firstHit.links().toString();

        // The selector's toString() can be null when nothing matched; guard before stripping tags.
        String nameHtml = firstHit.xpath("//a[@class=name]").toString();
        String name = nameHtml == null ? "" : nameHtml.replaceAll("\\<.*?>", "");

        // e.g. 阿里巴巴(中国)网络技术有限公司:https://www.tianyancha.com/company/1698375
        System.out.println(name + ":" + link);
        if (link != null) {
            page.addTargetRequest(link);
        }
    }

    /**
     * Parses a company detail page and prints the fields found in the first three rows of
     * the registration table. Pages that do not contain at least three such rows (for
     * example an empty search result or a login page) are reported on stderr and skipped
     * instead of failing with an {@link IndexOutOfBoundsException}.
     *
     * @param page the downloaded detail page
     */
    public void saveInfo(Page page) {
        List<Selectable> rows = page.getHtml().css(DETAIL_ROW_CSS).nodes();
        if (rows.size() < REQUIRED_DETAIL_ROWS) {
            System.err.println("Company detail table not found, skipping page: " + page.getUrl());
            return;
        }

        // e.g. 注册资本:1072526万美元 (registered capital)
        printLabelledTitle(rows.get(0));
        // e.g. 成立日期:1999-09-09 (date of establishment)
        printLabelledTitle(rows.get(1));

        List<Selectable> cells = rows.get(2).xpath("//td/text()").nodes();
        // e.g. 统一社会信用代码:91330100716105852F (unified social credit code)
        System.out.println("统一社会信用代码:" + textAt(cells, 1));
        // e.g. 工商注册号:330100400015575 (business registration number)
        System.out.println(textAt(cells, 2) + ":" + textAt(cells, 3));
    }

    /** Prints "label:value" where label is the row's first cell text and value the text of its titled div. */
    private static void printLabelledTitle(Selectable row) {
        String label = textAt(row.xpath("//td/text()").nodes(), 0);
        String value = row.xpath("//div[@title]/text()").toString();
        System.out.println(label + ":" + value);
    }

    /** Returns the text of the cell at {@code index}, or an empty string when the row has no such cell. */
    private static String textAt(List<Selectable> cells, int index) {
        return index < cells.size() ? cells.get(index).toString() : "";
    }

    /**
     * Returns the crawl settings used for every request.
     *
     * @return the shared {@link Site} configuration
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Scheduled entry point: starts a crawl from the search URL and blocks until it finishes.
     * Runs 1000 ms after startup and then 1000000 ms after each previous run has completed.
     */
    @Scheduled(initialDelay = 1000, fixedDelay = 1000000)
    public void process() {
        // Reuse this Spring-managed bean as the processor instead of building a second, unmanaged instance.
        Spider.create(this)
                .addUrl(url)
                // QueueScheduler keeps pending URLs in a queue; BloomFilterDuplicateRemover de-duplicates them.
                // Its argument should be set from the real workload, preferably above the expected
                // number of URLs (larger values use more memory).
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100)))
                .thread(3) // three worker threads; adjust as needed
                .run();
    }
}
如果需要登录后爬取,可以下载浏览器驱动模拟浏览器登录
以上代码仅供参考学习使用
Q.E.D.