前言

WebMagic实现网页爬取,当然也可以使用HttpClient和jsoup实现

WebMagic四大组件

  • Downloader 下载页面
  • PageProcessor 解析页面
  • Scheduler 管理url(去重)
  • Pipeline 结果处理(持久化)

去重
HashSet:优点是 Java 自带、使用方便;缺点是占用内存大,数据量大时性能较低。

Redis:优点是速度快、可以处理大量数据;缺点是需要额外搭建 Redis 服务器,成本较高。

布隆过滤器(BloomFilter):优点是占用内存比 HashSet 小,可以处理大量数据;缺点是存在小概率误判,即可能把没有出现过的 url 判定为重复(但不会把重复的 url 判定为不重复)。

小案例

pom(相关jar)

 <!-- WebMagic core package -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-core</artifactId>
        <version>0.7.3</version>
        <!-- Excludes the slf4j-log4j12 binding that webmagic-core would otherwise pull in.
             NOTE(review): presumably to avoid clashing with the host application's own
             SLF4J binding; confirm against the project's logging setup. -->
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <!-- WebMagic extension package -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>0.7.3</version>
    </dependency>
    <!-- Guava, required for WebMagic's Bloom filter support (BloomFilterDuplicateRemover).
         NOTE(review): 16.0 is an old release; verify WebMagic compatibility before upgrading. -->
    <dependency>
        <groupId>com.google.guava</groupId>
        <artifactId>guava</artifactId>
        <version>16.0</version>
    </dependency>

java代码

package com.uibobo.task;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.List;

/**
 * Scheduled WebMagic crawler that searches tianyancha.com for one company and prints a few
 * registration details taken from the first matching company's detail page.
 *
 * <p>The class is the {@link PageProcessor} for both kinds of page it visits:
 * <ul>
 *   <li>search result page: the first result's link is queued as the next request;</li>
 *   <li>any other page: treated as a company detail page and handed to
 *       {@link #saveInfo(Page)}.</li>
 * </ul>
 *
 * <p>Results are only written to standard output; nothing is persisted.
 *
 * @author bo bo (created 2020/8/31 22:49)
 */
@Component
public class JobTask implements PageProcessor {

    /** Search URL; the {@code key} parameter is the URL-encoded company name to look up. */
    private final String url = "https://www.tianyancha.com/search?key=%E9%98%BF%E9%87%8C%E5%B7%B4%E5%B7%B4%E7%BD%91%E7%BB%9C%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8";

    /** Crawl settings shared by every request issued by this processor. */
    private final Site site = Site.me()
            .setCharset("UTF-8")        // page encoding
            .setTimeOut(10 * 1000)      // download timeout, in milliseconds
            .setRetrySleepTime(3000)    // wait between retries, in milliseconds
            .setRetryTimes(3)           // number of retries
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36");

    /**
     * Handles one downloaded page.
     *
     * <p>If the page contains search result headers, the first result's name and link are
     * printed and the link is queued for crawling. Otherwise the page is passed to
     * {@link #saveInfo(Page)}.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        List<Selectable> resultHeaders =
                page.getHtml().css("div.result-list div.content div.header").nodes();
        if (resultHeaders.isEmpty()) {
            saveInfo(page);
            return;
        }

        // Only the first search hit is followed.
        Selectable firstHit = resultHeaders.get(0);
        String link = firstHit.links().toString();
        // The anchor's outer HTML is selected and the tags are stripped afterwards; the
        // selector yields null when the anchor is absent, so guard before replaceAll.
        String nameHtml = firstHit.xpath("//a[@class=name]").toString();
        String name = nameHtml == null ? null : nameHtml.replaceAll("\\<.*?>", "");

        // e.g. "<company name>:https://www.tianyancha.com/company/1698375"
        System.out.println(name + ":" + link);
        if (link != null) {
            page.addTargetRequest(link);
        }
    }

    /**
     * Parses a company detail page and prints registered capital, establishment date,
     * unified social credit code and business registration number.
     *
     * <p>The values are read from the first three rows of {@code table.-border-top-none}.
     * If the page has fewer than three such rows (for example an empty search result or a
     * page that is not a detail page) a message is written to standard error and nothing
     * else happens, instead of failing with an {@link IndexOutOfBoundsException}.
     *
     * @param page the downloaded page, expected to be a company detail page
     */
    public void saveInfo(Page page) {
        Html html = page.getHtml();
        // Evaluate the row selector once and reuse the result for all three rows.
        List<Selectable> rows = html.css("table.-border-top-none tr").nodes();
        if (rows.size() < 3) {
            System.err.println("Company info table not found, skipping page: " + page.getUrl());
            return;
        }

        // Row 0, e.g. registered capital: "<label>:<value>"
        printLabelAndTitle(rows.get(0));
        // Row 1, e.g. establishment date: "<label>:<value>"
        printLabelAndTitle(rows.get(1));

        // Row 2 carries the credit code and the registration number in its cell texts.
        List<Selectable> cells = rows.get(2).xpath("//td/text()").nodes();
        // Unified social credit code, e.g. 91330100716105852F
        System.out.println("统一社会信用代码:" + textAt(cells, 1));
        // Business registration number as "<label>:<value>", e.g. 330100400015575
        System.out.println(textAt(cells, 2) + ":" + textAt(cells, 3));
    }

    /** Prints "label:value" for a row whose label is its first cell text and whose value is in a titled div. */
    private static void printLabelAndTitle(Selectable row) {
        List<Selectable> cells = row.xpath("//td/text()").nodes();
        String value = row.xpath("//div[@title]/text()").toString();
        System.out.println(textAt(cells, 0) + ":" + value);
    }

    /** Returns the text of the cell at {@code index}, or {@code null} when the row has no such cell. */
    private static String textAt(List<Selectable> cells, int index) {
        return index < cells.size() ? cells.get(index).toString() : null;
    }

    /**
     * Returns the crawl settings used for every request.
     *
     * @return the shared {@link Site} configuration
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Scheduled entry point: starts a crawl of the search URL and blocks until it finishes.
     *
     * <p>The first run starts 1000 ms after start-up; each later run starts 1000000 ms after
     * the previous one completed.
     */
    @Scheduled(initialDelay = 1000, fixedDelay = 1000000)
    public void process() {
        // Reuse this Spring-managed bean as the processor instead of building a new instance.
        Spider.create(this)
                .addUrl(url)
                // QueueScheduler keeps pending urls in a queue; BloomFilterDuplicateRemover
                // de-duplicates them with a Bloom filter. Size the argument from the real
                // workload, preferably above the estimated url count (larger uses more memory).
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100)))
                .thread(3) // three worker threads; adjust as needed
                .run();
    }
}

如果需要登录后才能爬取,可以下载浏览器驱动(例如配合 Selenium 使用的 ChromeDriver)来模拟浏览器登录。

以上代码仅供参考学习使用

Q.E.D.