使用 Spring Boot + MyBatis-Plus + WebMagic 爬取51job的职位信息,并保存到 MySQL 数据库。
引入maven依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Inherit dependency management from Spring Boot. -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.2.5.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>

    <groupId>com.hg</groupId>
    <artifactId>spider-demo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>spider-demo</name>
    <description>爬虫实战</description>

    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>5.1.0</version>
        </dependency>
        <!-- Druid database connection pool -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid-spring-boot-starter</artifactId>
            <version>1.1.10</version>
        </dependency>
        <!-- MySQL connector (version managed by the Boot parent) -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <!-- MyBatis-Plus -->
        <dependency>
            <groupId>com.baomidou</groupId>
            <artifactId>mybatis-plus-boot-starter</artifactId>
            <version>3.0.5</version>
        </dependency>
        <!-- WebMagic crawler framework -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
        <!-- Guava is required by WebMagic's BloomFilterDuplicateRemover -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>16.0</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
创建数据库spider,新建表job_info
-- Table for crawled 51job postings, written by MysqlPipeline.
-- Fixes vs. original: dropped the stray dump artifact AUTO_INCREMENT=2,
-- and use utf8mb4 — MySQL's "utf8" is a 3-byte subset and fails on
-- supplementary characters (e.g. emoji) that scraped job_detail text may contain.
CREATE TABLE `job_info` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `company_name` varchar(100) DEFAULT NULL COMMENT '公司名称',
  `company_addr` varchar(200) DEFAULT NULL COMMENT '公司联系方式',
  `job_name` varchar(100) DEFAULT NULL COMMENT '职位名称',
  `job_addr` varchar(50) DEFAULT NULL COMMENT '工作地点',
  `salary` varchar(50) DEFAULT NULL COMMENT '薪资范围',
  `url` varchar(150) DEFAULT NULL COMMENT '招聘信息详情页',
  `time` varchar(10) DEFAULT NULL COMMENT '职位最近发布时间',
  `job_detail` text COMMENT '职位详情',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='招聘信息';
创建application.yml
spring:
  application:
    # Fixed typo: was "spider-servoce".
    name: spider-service
  jackson:
    time-zone: GMT+8
    date-format: yyyy-MM-dd HH:mm:ss
  datasource:
    driver-class-name: com.mysql.cj.jdbc.Driver
    url: jdbc:mysql://localhost:3306/spider?useUnicode=true&characterEncoding=utf8&autoReconnect=true&useSSL=false
    username: root
    password: root
    type: com.alibaba.druid.pool.DruidDataSource
    # Druid connection-pool tuning.
    druid:
      initialSize: 10
      minIdle: 10
      maxActive: 50
      maxWait: 60000
      timeBetweenEvictionRunsMillis: 60000
      minEvictableIdleTimeMillis: 300000
      validationQuery: SELECT 1 FROM DUAL
      testWhileIdle: true
      testOnBorrow: false
      testOnReturn: false
      poolPreparedStatements: true
      maxPoolPreparedStatementPerConnectionSize: 20
      filters: stat,wall
      connectionProperties: druid.stat.mergeSql=true;druid.stat.slowSqlMillis=5000

# MyBatis-Plus configuration.
mybatis-plus:
  mapper-locations: classpath:mapper/**/*.xml
  typeAliasesPackage: com.hg.*.entity
  global-config:
    db-config:
      id-type: auto
      field-strategy: not_empty
      table-underline: true
      db-type: mysql
    refresh: true
  configuration:
    map-underscore-to-camel-case: true
    cache-enabled: false

logging:
  level:
    org.springframework.web: info
    org.apache.http: info
    us.codecraft.webmagic: info
package com.hg.spider.entity; import com.baomidou.mybatisplus.annotation.TableId; import com.baomidou.mybatisplus.annotation.TableName; import lombok.Data; import lombok.extern.slf4j.Slf4j; @Data @TableName("job_info") @Slf4j public class JobInfo { @TableId private Long id; /** * 公司名 */ private String companyName; /** * 公司地址 */ private String companyAddr; /** * 工作名称 */ private String jobName; /** * 工作地址 */ private String jobAddr; /** * 工作详情 */ private String jobDetail; /** * 薪资 */ private String salary; /** * 爬取的url */ private String url; /** * 职位发布时间 */ private String time; } 复制代码
package com.hg.spider.dao;

import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.hg.spider.entity.JobInfo;

/**
 * MyBatis-Plus mapper for {@link JobInfo} (table {@code job_info}).
 * All CRUD operations are inherited from {@link BaseMapper}; no custom SQL needed.
 *
 * @Author skh
 * @Date 2020/3/21 16:27
 * @Desc data-access interface for crawled job postings
 */
public interface JobInfoDao extends BaseMapper<JobInfo> {
}
package com.hg.spider.service;

import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.hg.spider.dao.JobInfoDao;
import com.hg.spider.entity.JobInfo;
import com.hg.spider.webmagic.JobProcessor;
import com.hg.spider.webmagic.MysqlPipeline;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;

import java.util.List;

/**
 * Orchestrates the 51job crawl and provides lookups against job_info.
 *
 * @Author skh
 * @Date 2020/3/21 12:10
 * @Desc crawl orchestration + persistence queries
 */
@Service
@Slf4j
public class JobInfoService extends ServiceImpl<JobInfoDao, JobInfo> {

    // Seed URL: first page of the 51job search results for the target keyword.
    String url = "https://search.51job.com/list/080200,000000,0000,26,9,99,%25E6%2588%25BF%25E4%25BA%25A7%25E7%25BB%258F%25E7%25BA%25AA%25E4%25BA%25BA,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";

    @Autowired
    private MysqlPipeline mysqlPipeline;

    @Autowired
    private JobProcessor jobProcessor;

    /**
     * Configures and starts the crawler.
     *
     * BUGFIX: the original called {@code run()}, which blocks the calling
     * (HTTP request) thread until the whole crawl finishes. {@code runAsync()}
     * starts the spider on its own thread so the controller can return
     * immediately, which is clearly the intent ("success" is returned at once).
     */
    public void getJobInfo() {
        log.info("开始爬取数据");
        Spider.create(jobProcessor)
                .addUrl(url) // seed URL
                // Bloom filter de-duplicates URLs in memory (requires guava).
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .thread(50) // worker thread count
                .addPipeline(mysqlPipeline) // persistence pipeline
                .runAsync();
    }

    /**
     * Returns all persisted records whose {@code url} column equals the given
     * URL. Used by the processor to skip already-crawled detail pages.
     *
     * @param url detail-page URL to look up
     * @return matching records; empty list if none
     */
    public List<JobInfo> selectJobInfoByUrl(String url) {
        QueryWrapper<JobInfo> wrapper = new QueryWrapper<>();
        wrapper.eq("url", url);
        List<JobInfo> jobInfos = this.baseMapper.selectList(wrapper);
        return jobInfos;
    }
}
package com.hg.spider.controller; import com.hg.spider.service.JobInfoService; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.web.bind.annotation.GetMapping; import org.springframework.web.bind.annotation.RestController; /** * @Author skh * @Date 2020/3/21 12:24 * @Desc */ @RestController public class JobInfoController { @Autowired private JobInfoService jobInfoService; @GetMapping("/getJobInfo") public String getJobInfo() { jobInfoService.getJobInfo(); return "success"; } } 复制代码
package com.hg.spider.webmagic;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.StrUtil;
import com.hg.spider.entity.JobInfo;
import com.hg.spider.service.JobInfoService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.util.List;

/**
 * Parses 51job pages: list pages are mined for detail-page URLs, detail pages
 * are parsed into {@link JobInfo} records staged for {@code MysqlPipeline}.
 *
 * @Author skh
 * @Date 2020/3/20 22:56
 * @Desc page parsing
 */
@Component
@Slf4j
public class JobProcessor implements PageProcessor {

    @Autowired
    private JobInfoService jobInfoService;

    /**
     * Entry point invoked by WebMagic for every downloaded page.
     * A page containing the result list is treated as a list page; otherwise
     * it is assumed to be a job-detail page.
     *
     * @param page downloaded page to parse
     */
    @Override
    public void process(Page page) {
        List<Selectable> nodes = page.getHtml().css("div#resultList div.el").nodes();
        if (CollUtil.isEmpty(nodes)) {
            // No result list -> detail page: extract the posting and stage it.
            try {
                this.saveJobInfo(page);
            } catch (Exception e) {
                log.error("解析异常,异常原因:{}", e.getMessage(), e);
            }
        } else {
            // List page: queue each detail URL that is not already in the DB.
            for (Selectable node : nodes) {
                String jobInfoUrl = node.css("p.t1 span a").links().toString();
                if (StrUtil.isNotBlank(jobInfoUrl)) {
                    List<JobInfo> jobInfoList = jobInfoService.selectJobInfoByUrl(jobInfoUrl);
                    if (CollUtil.isEmpty(jobInfoList)) {
                        page.addTargetRequest(jobInfoUrl);
                    } else {
                        log.info("记录已存在,记录url:{}", jobInfoUrl);
                    }
                }
            }
            // Follow pagination.
            // BUGFIX: the original called all.get(all.size() - 1) unguarded,
            // throwing IndexOutOfBoundsException when no pagination link matches.
            List<String> all = page.getHtml().css("div.p_in li.bk a").links().all();
            if (CollUtil.isEmpty(all)) {
                log.info("no pagination link found, stopping");
                return;
            }
            String bkUrl = all.get(all.size() - 1);
            log.info("下一页Url:{}", bkUrl);
            // Hard stop after page 10 so the crawl does not run forever.
            if (StrUtil.containsAny(bkUrl, "11.html")) {
                log.info("已查到10页数据,无须无限爬取数据");
                return;
            }
            page.addTargetRequest(bkUrl);
        }
    }

    /**
     * Parses a job-detail page and stages the resulting {@link JobInfo} in
     * ResultItems under key "jobInfo" for the pipeline to persist.
     *
     * @param page detail page to parse
     */
    private void saveJobInfo(Page page) {
        Html html = page.getHtml();
        String companyName = html.css("div.cn p.cname a", "text").get();
        // Last p.fp paragraph holds the company address; guard against no match.
        List<String> text = html.css("div.bmsg.inbox p.fp", "text").all();
        String companyAddr = CollUtil.isEmpty(text) ? null : text.get(text.size() - 1);
        String jobName = html.css("div.cn h1", "text").get();
        String jobStr = html.css("p.msg.ltype", "text").get();
        // jobStr is a space-separated summary line; the first segment is the location.
        String[] s = StrUtil.split(jobStr, " ");
        String jobAddr = s[0];
        String time = "";
        for (String s1 : s) {
            // The segment containing "发布" carries the publish date, e.g. "03-21发布".
            if (StrUtil.containsAny(s1, "发布")) {
                time = StrUtil.removeAll(s1, "发布");
                break;
            }
        }
        // (typo fix: was "jonDetail")
        String jobDetail = html.css("div.bmsg.job_msg.inbox", "allText").get();
        String url = page.getUrl().get();
        String salary = html.css("div.in div.cn strong", "text").get();

        JobInfo jobInfo = new JobInfo();
        jobInfo.setJobName(jobName);
        jobInfo.setJobAddr(jobAddr);
        jobInfo.setJobDetail(jobDetail);
        jobInfo.setSalary(salary);
        jobInfo.setUrl(url);
        jobInfo.setTime(time);
        jobInfo.setCompanyName(companyName);
        jobInfo.setCompanyAddr(companyAddr);
        // Stage the record in ResultItems so MysqlPipeline can persist it.
        page.putField("jobInfo", jobInfo);
    }

    // Crawler settings: 51job serves GBK-encoded pages; retry transient failures.
    private Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36")
            .setCharset("gbk")
            .setTimeOut(10 * 1000)
            .setRetryTimes(3)
            .setRetrySleepTime(3000);

    @Override
    public Site getSite() {
        return site;
    }
}
package com.hg.spider.webmagic; import com.hg.spider.entity.JobInfo; import com.hg.spider.service.JobInfoService; import lombok.extern.slf4j.Slf4j; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; /** * @Author skh * @Date 2020/3/21 16:18 * @Desc */ @Component @Slf4j public class MysqlPipeline implements Pipeline { @Autowired private JobInfoService jobInfoService; @Override public void process(ResultItems resultItems, Task task) { //获取封装好的数据 JobInfo jobInfo = resultItems.get("jobInfo"); if (jobInfo != null) { jobInfoService.save(jobInfo); } } } 复制代码