commit 889bb8d3c0eaccd0780a65a7de2cb41d1eec3506
Author: louzin <294098546@qq.com>
Date: Thu May 4 09:34:15 2023 +0800
Init
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5ff6309
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,38 @@
+target/
+!.mvn/wrapper/maven-wrapper.jar
+!**/src/main/**/target/
+!**/src/test/**/target/
+
+### IntelliJ IDEA ###
+.idea/modules.xml
+.idea/jarRepositories.xml
+.idea/compiler.xml
+.idea/libraries/
+*.iws
+*.iml
+*.ipr
+
+### Eclipse ###
+.apt_generated
+.classpath
+.factorypath
+.project
+.settings
+.springBeans
+.sts4-cache
+
+### NetBeans ###
+/nbproject/private/
+/nbbuild/
+/dist/
+/nbdist/
+/.nb-gradle/
+build/
+!**/src/main/**/build/
+!**/src/test/**/build/
+
+### VS Code ###
+.vscode/
+
+### Mac OS ###
+.DS_Store
\ No newline at end of file
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..0a8642f
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,10 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Zeppelin ignored files
+/ZeppelinRemoteNotebooks/
diff --git a/.idea/encodings.xml b/.idea/encodings.xml
new file mode 100644
index 0000000..a12db2d
--- /dev/null
+++ b/.idea/encodings.xml
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..accd629
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/louzin_hdfsapi/.gitignore b/louzin_hdfsapi/.gitignore
new file mode 100644
index 0000000..5ff6309
--- /dev/null
+++ b/louzin_hdfsapi/.gitignore
@@ -0,0 +1,38 @@
+target/
+!.mvn/wrapper/maven-wrapper.jar
+!**/src/main/**/target/
+!**/src/test/**/target/
+
+### IntelliJ IDEA ###
+.idea/modules.xml
+.idea/jarRepositories.xml
+.idea/compiler.xml
+.idea/libraries/
+*.iws
+*.iml
+*.ipr
+
+### Eclipse ###
+.apt_generated
+.classpath
+.factorypath
+.project
+.settings
+.springBeans
+.sts4-cache
+
+### NetBeans ###
+/nbproject/private/
+/nbbuild/
+/dist/
+/nbdist/
+/.nb-gradle/
+build/
+!**/src/main/**/build/
+!**/src/test/**/build/
+
+### VS Code ###
+.vscode/
+
+### Mac OS ###
+.DS_Store
\ No newline at end of file
diff --git a/louzin_hdfsapi/pom.xml b/louzin_hdfsapi/pom.xml
new file mode 100644
index 0000000..3951eed
--- /dev/null
+++ b/louzin_hdfsapi/pom.xml
@@ -0,0 +1,47 @@
+
+
+ 4.0.0
+
+ com.louzin.openhdh
+ louzin
+ 1.0-SNAPSHOT
+
+
+ com.louzin.hdfsapi
+ louzin_hdfsapi
+
+
+ org.apache.hadoop
+ hadoop-common
+ 2.9.0
+
+
+ org.apache.hadoop
+ hadoop-client
+ 2.9.0
+
+
+ org.apache.hadoop
+ hadoop-hdfs
+ 2.9.0
+
+
+ org.apache.hadoop
+ hadoop-mapreduce-client-core
+ 2.9.0
+
+
+ junit
+ junit
+ RELEASE
+
+
+
+ 11
+ 11
+ UTF-8
+
+
+
\ No newline at end of file
diff --git a/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/bean/WebLogBean.java b/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/bean/WebLogBean.java
new file mode 100644
index 0000000..feb68de
--- /dev/null
+++ b/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/bean/WebLogBean.java
@@ -0,0 +1,167 @@
+package com.louzin.hdfsapi.bean;
+
+import org.apache.hadoop.io.Writable;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * 对接外部数据的层,表结构定义最好跟外部数据源保持一致
+ * 术语: 贴源表
+ * @author itcast
+ *
+ */
+public class WebLogBean implements Writable {
+
+ private boolean valid = true;// 判断数据是否合法
+ private String remote_addr;// 记录客户端的ip地址
+ private String remote_user;// 记录客户端用户名称,忽略属性"-"
+ private String time_local;// 记录访问时间与时区
+ private String request;// 记录请求的url与http协议
+ private String status;// 记录请求状态;成功是200
+ private String body_bytes_sent;// 记录发送给客户端文件主体内容大小
+ private String http_referer;// 用来记录从那个页面链接访问过来的
+ private String http_user_agent;// 记录客户浏览器的相关信息
+
+ //设置属性值
+ public void set(boolean valid,String remote_addr, String remote_user, String time_local, String request, String status, String body_bytes_sent, String http_referer, String http_user_agent) {
+ this.valid = valid;
+ this.remote_addr = remote_addr;
+ this.remote_user = remote_user;
+ this.time_local = time_local;
+ this.request = request;
+ this.status = status;
+ this.body_bytes_sent = body_bytes_sent;
+ this.http_referer = http_referer;
+ this.http_user_agent = http_user_agent;
+ }
+
+ public String getRemote_addr() {
+ return remote_addr;
+ }
+
+ public void setRemote_addr(String remote_addr) {
+ this.remote_addr = remote_addr;
+ }
+
+ public String getRemote_user() {
+ return remote_user;
+ }
+
+ public void setRemote_user(String remote_user) {
+ this.remote_user = remote_user;
+ }
+
+ public String getTime_local() {
+ return this.time_local;
+ }
+
+ public void setTime_local(String time_local) {
+ this.time_local = time_local;
+ }
+
+ public String getRequest() {
+ return request;
+ }
+
+ public void setRequest(String request) {
+ this.request = request;
+ }
+
+ public String getStatus() {
+ return status;
+ }
+
+ public void setStatus(String status) {
+ this.status = status;
+ }
+
+ public String getBody_bytes_sent() {
+ return body_bytes_sent;
+ }
+
+ public void setBody_bytes_sent(String body_bytes_sent) {
+ this.body_bytes_sent = body_bytes_sent;
+ }
+
+ public String getHttp_referer() {
+ return http_referer;
+ }
+
+ public void setHttp_referer(String http_referer) {
+ this.http_referer = http_referer;
+ }
+
+ public String getHttp_user_agent() {
+ return http_user_agent;
+ }
+
+ public void setHttp_user_agent(String http_user_agent) {
+ this.http_user_agent = http_user_agent;
+ }
+
+ public boolean isValid() {
+ return valid;
+ }
+
+ public void setValid(boolean valid) {
+ this.valid = valid;
+ }
+
+ /**
+ * 重写toString()方法,使用Hive默认分隔符进行分隔,为后期导入Hive表提供便利
+ * @return
+ */
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append(this.valid);
+ sb.append("\001").append(this.getRemote_addr());
+ sb.append("\001").append(this.getRemote_user());
+ sb.append("\001").append(this.getTime_local());
+ sb.append("\001").append(this.getRequest());
+ sb.append("\001").append(this.getStatus());
+ sb.append("\001").append(this.getBody_bytes_sent());
+ sb.append("\001").append(this.getHttp_referer());
+ sb.append("\001").append(this.getHttp_user_agent());
+ return sb.toString();
+ }
+
+ /**
+ * 序列化方法
+ * @param in
+ * @throws IOException
+ */
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ this.valid = in.readBoolean();
+ this.remote_addr = in.readUTF();
+ this.remote_user = in.readUTF();
+ this.time_local = in.readUTF();
+ this.request = in.readUTF();
+ this.status = in.readUTF();
+ this.body_bytes_sent = in.readUTF();
+ this.http_referer = in.readUTF();
+ this.http_user_agent = in.readUTF();
+ }
+
+ /**
+ * 反序列化方法
+ * @param out
+ * @throws IOException
+ */
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeBoolean(this.valid);
+ out.writeUTF(null==remote_addr?"":remote_addr);
+ out.writeUTF(null==remote_user?"":remote_user);
+ out.writeUTF(null==time_local?"":time_local);
+ out.writeUTF(null==request?"":request);
+ out.writeUTF(null==status?"":status);
+ out.writeUTF(null==body_bytes_sent?"":body_bytes_sent);
+ out.writeUTF(null==http_referer?"":http_referer);
+ out.writeUTF(null==http_user_agent?"":http_user_agent);
+ }
+
+}
diff --git a/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/util/WebLogParser.java b/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/util/WebLogParser.java
new file mode 100644
index 0000000..fcb69cf
--- /dev/null
+++ b/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/util/WebLogParser.java
@@ -0,0 +1,71 @@
+package com.louzin.hdfsapi.util;
+import com.louzin.hdfsapi.bean.WebLogBean;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Locale;
+import java.util.Set;
+
+
+public class WebLogParser {
+
+ //定义时间格式
+ public static SimpleDateFormat df1 = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US);
+ public static SimpleDateFormat df2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US);
+
+ public static WebLogBean parser(String line) {
+ WebLogBean webLogBean = new WebLogBean();
+ //把一行数据以空格字符切割并存入数组arr中
+
+ //157.55.32.96 - - [18/Sep/2013:19:36:07 +0000] "GET /robots.txt HTTP/1.1" 404 169 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"
+ //false 157.55.32.96 - - 2013-09-18 19:36:07 GET /robots.txt HTTP/1.1 404 169 "-" Mozilla/5. (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)
+ String[] arr = line.split(" ");
+ //如果数组长度小于等于11,说明这条数据不完整,因此可以忽略这条数据
+ if (arr.length > 11) {
+ //满足条件的数据逐个赋值给webLogBean对象
+ webLogBean.setRemote_addr(arr[0]);
+ webLogBean.setRemote_user(arr[1]);
+ String time_local = formatDate(arr[3].substring(1));//[18/Sep/2013:19:36:07 -> 18/Sep/2013:19:36:07->2013-09-18 19:36:07
+ if(null==time_local || "".equals(time_local)) time_local="-invalid_time-";
+ webLogBean.setTime_local(time_local);
+ webLogBean.setRequest(arr[6]);
+ webLogBean.setStatus(arr[8]);
+ webLogBean.setBody_bytes_sent(arr[9]);
+ webLogBean.setHttp_referer(arr[10]);
+ //如果useragent元素较多,拼接useragent
+ if (arr.length > 12) {
+ StringBuilder sb = new StringBuilder();
+ for(int i=11;i= 400) {// 大于400,HTTP错误
+ webLogBean.setValid(false);
+ }
+ if("-invalid_time-".equals(webLogBean.getTime_local())){
+ webLogBean.setValid(false);
+ }
+ } else {
+ webLogBean=null;
+ }
+ return webLogBean;
+ }
+
+ //添加标识
+ public static void filtStaticResource(WebLogBean bean, Set pages) {
+ if (!pages.contains(bean.getRequest())) {
+ bean.setValid(false);
+ }
+ }
+ //格式化时间方法
+ public static String formatDate(String time_local) {
+ try {
+ return df2.format(df1.parse(time_local));
+ } catch (ParseException e) {
+ return null;
+ }
+ }
+
+}
diff --git a/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/util/WeblogPreProcess.java b/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/util/WeblogPreProcess.java
new file mode 100644
index 0000000..b473f59
--- /dev/null
+++ b/louzin_hdfsapi/src/main/java/com/louzin/hdfsapi/util/WeblogPreProcess.java
@@ -0,0 +1,73 @@
+package com.louzin.hdfsapi.util;
+
+import com.louzin.hdfsapi.bean.WebLogBean;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * 处理原始日志,过滤出真实请求数据,转换时间格式,对缺失字段填充默认值,对记录标记valid和invalid
+ */
+public class WeblogPreProcess {
+
+ public static void main(String[] args) throws Exception {
+ Configuration conf = new Configuration();
+ conf.set("mapreduce.framework.name","local");
+ Job job = Job.getInstance(conf);
+ job.setJarByClass(WeblogPreProcess.class);
+ job.setMapperClass(WeblogPreProcessMapper.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(NullWritable.class);
+ FileInputFormat.setInputPaths(job, new Path("hdfs://local1:8020/flume/events/23-05-02/2040"));
+ FileOutputFormat.setOutputPath(job, new Path("hdfs://local1:8020/result/counts.txt"));
+ job.setNumReduceTasks(1);
+ boolean res = job.waitForCompletion(true);
+ System.exit(res ? 0 : 1);
+ }
+
+ public static class WeblogPreProcessMapper extends Mapper {
+ // 用来存储网站url分类数据
+ Set pages = new HashSet();
+ Text k = new Text();
+ NullWritable v = NullWritable.get();
+ /**
+ * 设置初始化方法,加载网站需要分析的url分类数据,存储到MapTask的内存中,用来对日志数据进行过滤
+ * 如果用户请求的资源是以下列形式,就表示用户请求的是合法资源。
+ */
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ pages.add("/about");
+ pages.add("/black-ip-list/");
+ pages.add("/cassandra-clustor/");
+ pages.add("/finance-rhive-repurchase/");
+ pages.add("/hadoop-family-roadmap/");
+ pages.add("/hadoop-hive-intro/");
+ pages.add("/hadoop-zookeeper-intro/");
+ pages.add("/hadoop-mahout-roadmap/");
+ }
+
+ @Override
+ protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+ //获取一行数据
+ String line = value.toString();
+ //调用解析类WebLogParser解析日志数据,最后封装为WebLogBean对象
+ WebLogBean webLogBean = WebLogParser.parser(line);
+ if (webLogBean != null) {
+ // 过滤js/图片/css等静态资源
+ WebLogParser.filtStaticResource(webLogBean, pages);
+ k.set(webLogBean.toString());
+ context.write(k, v);
+ }
+ }
+ }
+}
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..dd05477
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,21 @@
+
+
+ 4.0.0
+
+ com.louzin.openhdh
+ louzin
+ 1.0-SNAPSHOT
+ pom
+
+ louzin_hdfsapi
+
+
+
+ 11
+ 11
+ UTF-8
+
+
+
\ No newline at end of file