commit 889bb8d3c0eaccd0780a65a7de2cb41d1eec3506 Author: louzin <294098546@qq.com> Date: Thu May 4 09:34:15 2023 +0800 Init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5ff6309 --- /dev/null +++ b/.gitignore @@ -0,0 +1,38 @@ +target/ +!.mvn/wrapper/maven-wrapper.jar +!**/src/main/**/target/ +!**/src/test/**/target/ + +### IntelliJ IDEA ### +.idea/modules.xml +.idea/jarRepositories.xml +.idea/compiler.xml +.idea/libraries/ +*.iws +*.iml +*.ipr + +### Eclipse ### +.apt_generated +.classpath +.factorypath +.project +.settings +.springBeans +.sts4-cache + +### NetBeans ### +/nbproject/private/ +/nbbuild/ +/dist/ +/nbdist/ +/.nb-gradle/ +build/ +!**/src/main/**/build/ +!**/src/test/**/build/ + +### VS Code ### +.vscode/ + +### Mac OS ### +.DS_Store \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..0a8642f --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,10 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Zeppelin ignored files +/ZeppelinRemoteNotebooks/ diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 0000000..a12db2d --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..accd629 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,14 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/louzin_hdfsapi/.gitignore b/louzin_hdfsapi/.gitignore new file mode 100644 index 0000000..5ff6309 --- /dev/null +++ b/louzin_hdfsapi/.gitignore @@ -0,0 +1,38 @@ +target/ +!.mvn/wrapper/maven-wrapper.jar +!**/src/main/**/target/ +!**/src/test/**/target/ + +### IntelliJ IDEA ### +.idea/modules.xml +.idea/jarRepositories.xml +.idea/compiler.xml +.idea/libraries/ +*.iws +*.iml +*.ipr + +### Eclipse ### +.apt_generated +.classpath +.factorypath +.project +.settings +.springBeans +.sts4-cache + +### NetBeans ### +/nbproject/private/ +/nbbuild/ +/dist/ +/nbdist/ +/.nb-gradle/ +build/ +!**/src/main/**/build/ +!**/src/test/**/build/ + +### VS Code ### +.vscode/ + +### Mac OS ### +.DS_Store \ No newline at end of file diff --git a/louzin_hdfsapi/pom.xml b/louzin_hdfsapi/pom.xml new file mode 100644 index 0000000..3951eed --- /dev/null +++ b/louzin_hdfsapi/pom.xml @@ -0,0 +1,47 @@ + + + 4.0.0 + + com.louzin.openhdh + louzin + 1.0-SNAPSHOT + + + com.louzin.hdfsapi + louzin_hdfsapi + + + org.apache.hadoop + hadoop-common + 2.9.0 + + + org.apache.hadoop + hadoop-client + 2.9.0 + + + org.apache.hadoop + hadoop-hdfs + 2.9.0 + + + org.apache.hadoop + hadoop-mapreduce-client-core + 2.9.0 + + + junit + junit + RELEASE + + + + 11 + 11 + UTF-8 + + + \ No newline at end of file diff --git a/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/bean/WebLogBean.java b/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/bean/WebLogBean.java new file mode 100644 index 0000000..feb68de --- /dev/null +++ b/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/bean/WebLogBean.java @@ -0,0 +1,167 @@ +package com.louzin.hdfsapi.bean; + +import org.apache.hadoop.io.Writable; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +/** + * 对接外部数据的层,表结构定义最好跟外部数据源保持一致 + * 术语: 贴源表 + * @author itcast + * + */ +public class WebLogBean implements Writable { + + private boolean valid = true;// 判断数据是否合法 + private String remote_addr;// 记录客户端的ip地址 + private String remote_user;// 记录客户端用户名称,忽略属性"-" + private String time_local;// 记录访问时间与时区 + private String request;// 记录请求的url与http协议 + private String status;// 记录请求状态;成功是200 + private String body_bytes_sent;// 记录发送给客户端文件主体内容大小 + private String http_referer;// 用来记录从那个页面链接访问过来的 + private String http_user_agent;// 记录客户浏览器的相关信息 + + //设置属性值 + public void set(boolean valid,String remote_addr, String remote_user, String time_local, String request, String status, String body_bytes_sent, String http_referer, String http_user_agent) { + this.valid = valid; + this.remote_addr = remote_addr; + this.remote_user = remote_user; + this.time_local = time_local; + this.request = request; + this.status = status; + this.body_bytes_sent = body_bytes_sent; + this.http_referer = http_referer; + this.http_user_agent = http_user_agent; + } + + public String getRemote_addr() { + return remote_addr; + } + + public void setRemote_addr(String remote_addr) { + this.remote_addr = remote_addr; + } + + public String getRemote_user() { + return remote_user; + } + + public void setRemote_user(String remote_user) { + this.remote_user = remote_user; + } + + public String getTime_local() { + return this.time_local; + } + + public void setTime_local(String time_local) { + this.time_local = time_local; + } + + public String getRequest() { + return request; + } + + public void setRequest(String request) { + this.request = request; + } + + public String getStatus() { + return status; + } + + public void setStatus(String status) { + this.status = status; + } + + public String getBody_bytes_sent() { + return body_bytes_sent; + } + + public void setBody_bytes_sent(String body_bytes_sent) { + this.body_bytes_sent = body_bytes_sent; + } + + public String getHttp_referer() { + return http_referer; + } + + public void setHttp_referer(String http_referer) { + this.http_referer = http_referer; + } + + public String getHttp_user_agent() { + return http_user_agent; + } + + public void setHttp_user_agent(String http_user_agent) { + this.http_user_agent = http_user_agent; + } + + public boolean isValid() { + return valid; + } + + public void setValid(boolean valid) { + this.valid = valid; + } + + /** + * 重写toString()方法,使用Hive默认分隔符进行分隔,为后期导入Hive表提供便利 + * @return + */ + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(this.valid); + sb.append("\001").append(this.getRemote_addr()); + sb.append("\001").append(this.getRemote_user()); + sb.append("\001").append(this.getTime_local()); + sb.append("\001").append(this.getRequest()); + sb.append("\001").append(this.getStatus()); + sb.append("\001").append(this.getBody_bytes_sent()); + sb.append("\001").append(this.getHttp_referer()); + sb.append("\001").append(this.getHttp_user_agent()); + return sb.toString(); + } + + /** + * 序列化方法 + * @param in + * @throws IOException + */ + @Override + public void readFields(DataInput in) throws IOException { + this.valid = in.readBoolean(); + this.remote_addr = in.readUTF(); + this.remote_user = in.readUTF(); + this.time_local = in.readUTF(); + this.request = in.readUTF(); + this.status = in.readUTF(); + this.body_bytes_sent = in.readUTF(); + this.http_referer = in.readUTF(); + this.http_user_agent = in.readUTF(); + } + + /** + * 反序列化方法 + * @param out + * @throws IOException + */ + @Override + public void write(DataOutput out) throws IOException { + out.writeBoolean(this.valid); + out.writeUTF(null==remote_addr?"":remote_addr); + out.writeUTF(null==remote_user?"":remote_user); + out.writeUTF(null==time_local?"":time_local); + out.writeUTF(null==request?"":request); + out.writeUTF(null==status?"":status); + out.writeUTF(null==body_bytes_sent?"":body_bytes_sent); + out.writeUTF(null==http_referer?"":http_referer); + out.writeUTF(null==http_user_agent?"":http_user_agent); + } + +} diff --git a/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/util/WebLogParser.java b/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/util/WebLogParser.java new file mode 100644 index 0000000..fcb69cf --- /dev/null +++ b/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/util/WebLogParser.java @@ -0,0 +1,71 @@ +package com.louzin.hdfsapi.util; +import com.louzin.hdfsapi.bean.WebLogBean; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Locale; +import java.util.Set; + + +public class WebLogParser { + + //定义时间格式 + public static SimpleDateFormat df1 = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US); + public static SimpleDateFormat df2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US); + + public static WebLogBean parser(String line) { + WebLogBean webLogBean = new WebLogBean(); + //把一行数据以空格字符切割并存入数组arr中 + + //157.55.32.96 - - [18/Sep/2013:19:36:07 +0000] "GET /robots.txt HTTP/1.1" 404 169 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)" + //false 157.55.32.96 - - 2013-09-18 19:36:07 GET /robots.txt HTTP/1.1 404 169 "-" Mozilla/5. (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) + String[] arr = line.split(" "); + //如果数组长度小于等于11,说明这条数据不完整,因此可以忽略这条数据 + if (arr.length > 11) { + //满足条件的数据逐个赋值给webLogBean对象 + webLogBean.setRemote_addr(arr[0]); + webLogBean.setRemote_user(arr[1]); + String time_local = formatDate(arr[3].substring(1));//[18/Sep/2013:19:36:07 -> 18/Sep/2013:19:36:07->2013-09-18 19:36:07 + if(null==time_local || "".equals(time_local)) time_local="-invalid_time-"; + webLogBean.setTime_local(time_local); + webLogBean.setRequest(arr[6]); + webLogBean.setStatus(arr[8]); + webLogBean.setBody_bytes_sent(arr[9]); + webLogBean.setHttp_referer(arr[10]); + //如果useragent元素较多,拼接useragent + if (arr.length > 12) { + StringBuilder sb = new StringBuilder(); + for(int i=11;i= 400) {// 大于400,HTTP错误 + webLogBean.setValid(false); + } + if("-invalid_time-".equals(webLogBean.getTime_local())){ + webLogBean.setValid(false); + } + } else { + webLogBean=null; + } + return webLogBean; + } + + //添加标识 + public static void filtStaticResource(WebLogBean bean, Set pages) { + if (!pages.contains(bean.getRequest())) { + bean.setValid(false); + } + } + //格式化时间方法 + public static String formatDate(String time_local) { + try { + return df2.format(df1.parse(time_local)); + } catch (ParseException e) { + return null; + } + } + +} diff --git a/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/util/WeblogPreProcess.java b/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/util/WeblogPreProcess.java new file mode 100644 index 0000000..b473f59 --- /dev/null +++ b/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/util/WeblogPreProcess.java @@ -0,0 +1,73 @@ +package com.louzin.hdfsapi.util; + +import com.louzin.hdfsapi.bean.WebLogBean; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + +/** + * 处理原始日志,过滤出真实请求数据,转换时间格式,对缺失字段填充默认值,对记录标记valid和invalid + */ +public class WeblogPreProcess { + + public static void main(String[] args) throws Exception { + Configuration conf = new Configuration(); + conf.set("mapreduce.framework.name","local"); + Job job = Job.getInstance(conf); + job.setJarByClass(WeblogPreProcess.class); + job.setMapperClass(WeblogPreProcessMapper.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(NullWritable.class); + FileInputFormat.setInputPaths(job, new Path("hdfs://local1:8020/flume/events/23-05-02/2040")); + FileOutputFormat.setOutputPath(job, new Path("hdfs://local1:8020/result/counts.txt")); + job.setNumReduceTasks(1); + boolean res = job.waitForCompletion(true); + System.exit(res ? 0 : 1); + } + + public static class WeblogPreProcessMapper extends Mapper { + // 用来存储网站url分类数据 + Set pages = new HashSet(); + Text k = new Text(); + NullWritable v = NullWritable.get(); + /** + * 设置初始化方法,加载网站需要分析的url分类数据,存储到MapTask的内存中,用来对日志数据进行过滤 + * 如果用户请求的资源是以下列形式,就表示用户请求的是合法资源。 + */ + @Override + protected void setup(Context context) throws IOException, InterruptedException { + pages.add("/about"); + pages.add("/black-ip-list/"); + pages.add("/cassandra-clustor/"); + pages.add("/finance-rhive-repurchase/"); + pages.add("/hadoop-family-roadmap/"); + pages.add("/hadoop-hive-intro/"); + pages.add("/hadoop-zookeeper-intro/"); + pages.add("/hadoop-mahout-roadmap/"); + } + + @Override + protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { + //获取一行数据 + String line = value.toString(); + //调用解析类WebLogParser解析日志数据,最后封装为WebLogBean对象 + WebLogBean webLogBean = WebLogParser.parser(line); + if (webLogBean != null) { + // 过滤js/图片/css等静态资源 + WebLogParser.filtStaticResource(webLogBean, pages); + k.set(webLogBean.toString()); + context.write(k, v); + } + } + } +} diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..dd05477 --- /dev/null +++ b/pom.xml @@ -0,0 +1,21 @@ + + + 4.0.0 + + com.louzin.openhdh + louzin + 1.0-SNAPSHOT + pom + + louzin_hdfsapi + + + + 11 + 11 + UTF-8 + + + \ No newline at end of file