mahout面试题?
之前看了Mahout官方示例 20news 的调用实现;于是想根据示例的流程实现其他例子。网上看到了一个关于天气适不适合打羽毛球的例子。
训练数据:
Day Outlook Temperature Humidity Wind PlayTennis
D1 Sunny Hot High Weak No
D2 Sunny Hot High Strong No
D3 Overcast Hot High Weak Yes
D4 Rain Mild High Weak Yes
D5 Rain Cool Normal Weak Yes
D6 Rain Cool Normal Strong No
D7 Overcast Cool Normal Strong Yes
D8 Sunny Mild High Weak No
D9 Sunny Cool Normal Weak Yes
D10 Rain Mild Normal Weak Yes
D11 Sunny Mild Normal Strong Yes
D12 Overcast Mild High Strong Yes
D13 Overcast Hot Normal Weak Yes
D14 Rain Mild High Strong No
检测数据:
sunny,hot,high,weak
结果:
Yes=》 0.007039
No=》 0.027418
于是使用Java代码调用Mahout的工具类实现分类。
基本思想:
1. 构造分类数据。
2. 使用Mahout工具类进行训练,得到训练模型。
3。将要检测数据转换成vector数据。
4. 分类器对vector数据进行分类。
接下来贴下我的代码实现=》
1. 构造分类数据:
在hdfs主要创建一个文件夹路径 /zhoujainfeng/playtennis/input 并将分类文件夹 no 和 yes 的数据传到hdfs上面。
数据文件格式,如D1文件内容: Sunny Hot High Weak
2. 使用Mahout工具类进行训练,得到训练模型。
3。将要检测数据转换成vector数据。
4. 分类器对vector数据进行分类。
这三步,代码我就一次全贴出来;主要是两个类 PlayTennis1 和 BayesCheckData = =》
package myTesting.bayes;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob;
import org.apache.mahout.text.SequenceFilesFromDirectory;
import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles;
public class PlayTennis1 {
private static final String WORK_DIR = "hdfs://192.168.9.72:9000/zhoujianfeng/playtennis";
/*
* 测试代码
*/
public static void main(String[] args) {
//将训练数据转换成 vector数据
makeTrainVector();
//产生训练模型
makeModel(false);
//测试检测数据
BayesCheckData.printResult();
}
public static void makeCheckVector(){
//将测试数据转换成序列化文件
try {
Configuration conf = new Configuration();
conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
String input = WORK_DIR+Path.SEPARATOR+"testinput";
String output = WORK_DIR+Path.SEPARATOR+"tennis-test-seq";
Path in = new Path(input);
Path out = new Path(output);
FileSystem fs = FileSystem.get(conf);
if(fs.exists(in)){
if(fs.exists(out)){
//boolean参数是,是否递归删除的意思
fs.delete(out, true);
}
SequenceFilesFromDirectory sffd = new SequenceFilesFromDirectory();
String[] params = new String[]{"-i",input,"-o",output,"-ow"};
ToolRunner.run(sffd, params);
}
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
System.out.println("文件序列化失败!");
System.exit(1);
}
//将序列化文件转换成向量文件
try {
Configuration conf = new Configuration();
conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
String input = WORK_DIR+Path.SEPARATOR+"tennis-test-seq";
String output = WORK_DIR+Path.SEPARATOR+"tennis-test-vectors";
Path in = new Path(input);
Path out = new Path(output);
FileSystem fs = FileSystem.get(conf);
if(fs.exists(in)){
if(fs.exists(out)){
//boolean参数是,是否递归删除的意思
fs.delete(out, true);
}
SparseVectorsFromSequenceFiles svfsf = new SparseVectorsFromSequenceFiles();
String[] params = new String[]{"-i",input,"-o",output,"-lnorm","-nv","-wt","tfidf"};
ToolRunner.run(svfsf, params);
}
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
System.out.println("序列化文件转换成向量失败!");
System.out.println(2);
}
}
public static void makeTrainVector(){
//将测试数据转换成序列化文件
try {
Configuration conf = new Configuration();
conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
String input = WORK_DIR+Path.SEPARATOR+"input";
String output = WORK_DIR+Path.SEPARATOR+"tennis-seq";
Path in = new Path(input);
Path out = new Path(output);
FileSystem fs = FileSystem.get(conf);
if(fs.exists(in)){
if(fs.exists(out)){
//boolean参数是,是否递归删除的意思
fs.delete(out, true);
}
SequenceFilesFromDirectory sffd = new SequenceFilesFromDirectory();
String[] params = new String[]{"-i",input,"-o",output,"-ow"};
ToolRunner.run(sffd, params);
}
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
System.out.println("文件序列化失败!");
System.exit(1);
}
//将序列化文件转换成向量文件
try {
Configuration conf = new Configuration();
conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
String input = WORK_DIR+Path.SEPARATOR+"tennis-seq";
String output = WORK_DIR+Path.SEPARATOR+"tennis-vectors";
Path in = new Path(input);
Path out = new Path(output);
FileSystem fs = FileSystem.get(conf);
if(fs.exists(in)){
if(fs.exists(out)){
//boolean参数是,是否递归删除的意思
fs.delete(out, true);
}
SparseVectorsFromSequenceFiles svfsf = new SparseVectorsFromSequenceFiles();
String[] params = new String[]{"-i",input,"-o",output,"-lnorm","-nv","-wt","tfidf"};
ToolRunner.run(svfsf, params);
}
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
System.out.println("序列化文件转换成向量失败!");
System.out.println(2);
}
}
public static void makeModel(boolean completelyNB){
try {
Configuration conf = new Configuration();
conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
String input = WORK_DIR+Path.SEPARATOR+"tennis-vectors"+Path.SEPARATOR+"tfidf-vectors";
String model = WORK_DIR+Path.SEPARATOR+"model";
String labelindex = WORK_DIR+Path.SEPARATOR+"labelindex";
Path in = new Path(input);
Path out = new Path(model);
Path label = new Path(labelindex);
FileSystem fs = FileSystem.get(conf);
if(fs.exists(in)){
if(fs.exists(out)){
//boolean参数是,是否递归删除的意思
fs.delete(out, true);
}
if(fs.exists(label)){
//boolean参数是,是否递归删除的意思
fs.delete(label, true);
}
TrainNaiveBayesJob tnbj = new TrainNaiveBayesJob();
String[] params =null;
if(completelyNB){
params = new String[]{"-i",input,"-el","-o",model,"-li",labelindex,"-ow","-c"};
}else{
params = new String[]{"-i",input,"-el","-o",model,"-li",labelindex,"-ow"};
}
ToolRunner.run(tnbj, params);
}
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
System.out.println("生成训练模型失败!");
System.exit(3);
}
}
}
package myTesting.bayes;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.mahout.classifier.naivebayes.BayesUtils;
import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;
import org.apache.mahout.vectorizer.TFIDF;
import com.google.common.collect.ConcurrentHashMultiset;
import com.google.common.collect.Multiset;
public class BayesCheckData {
private static StandardNaiveBayesClassifier classifier;
private static Map<String, Integer> dictionary;
private static Map<Integer, Long> documentFrequency;
private static Map<Integer, String> labelIndex;
public void init(Configuration conf){
try {
String modelPath = "/zhoujianfeng/playtennis/model";
String dictionaryPath = "/zhoujianfeng/playtennis/tennis-vectors/dictionary.file-0";
String documentFrequencyPath = "/zhoujianfeng/playtennis/tennis-vectors/df-count";
String labelIndexPath = "/zhoujianfeng/playtennis/labelindex";
dictionary = readDictionnary(conf, new Path(dictionaryPath));
documentFrequency = readDocumentFrequency(conf, new Path(documentFrequencyPath));
labelIndex = BayesUtils.readLabelIndex(conf, new Path(labelIndexPath));
NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), conf);
classifier = new StandardNaiveBayesClassifier(model);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
System.out.println("检测数据构造成vectors初始化时报错。。。。");
System.exit(4);
}
}
/**
* 加载字典文件,Key: TermValue; Value:TermID
* @param conf
* @param dictionnaryDir
* @return
*/
private static Map<String, Integer> readDictionnary(Configuration conf, Path dictionnaryDir) {
Map<String, Integer> dictionnary = new HashMap<String, Integer>();
PathFilter filter = new PathFilter() {
@Override
public boolean accept(Path path) {
String name = path.getName();
return name.startsWith("dictionary.file");
}
};
for (Pair<Text, IntWritable> pair : new SequenceFileDirIterable<Text, IntWritable>(dictionnaryDir, PathType.LIST, filter, conf)) {
dictionnary.put(pair.getFirst().toString(), pair.getSecond().get());
}
return dictionnary;
}
/**
* 加载df-count目录下TermDoc频率文件,Key: TermID; Value:DocFreq
* @param conf
* @param dictionnaryDir
* @return
*/
private static Map<Integer, Long> readDocumentFrequency(Configuration conf, Path documentFrequencyDir) {
Map<Integer, Long> documentFrequency = new HashMap<Integer, Long>();
PathFilter filter = new PathFilter() {
@Override
public boolean accept(Path path) {
return path.getName().startsWith("part-r");
}
};
for (Pair<IntWritable, LongWritable> pair : new SequenceFileDirIterable<IntWritable, LongWritable>(documentFrequencyDir, PathType.LIST, filter, conf)) {
documentFrequency.put(pair.getFirst().get(), pair.getSecond().get());
}
return documentFrequency;
}
public static String getCheckResult(){
Configuration conf = new Configuration();
conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
String classify = "NaN";
BayesCheckData cdv = new BayesCheckData();
cdv.init(conf);
System.out.println("init done...............");
Vector vector = new RandomAccessSparseVector(10000);
TFIDF tfidf = new TFIDF();
//sunny,hot,high,weak
Multiset<String> words = ConcurrentHashMultiset.create();
words.add("sunny",1);
words.add("hot",1);
words.add("high",1);
words.add("weak",1);
int documentCount = documentFrequency.get(-1).intValue(); // key=-1时表示总文档数
for (Multiset.Entry<String> entry : words.entrySet()) {
String word = entry.getElement();
int count = entry.getCount();
Integer wordId = dictionary.get(word); // 需要从dictionary.file-0文件(tf-vector)下得到wordID,
if (StringUtils.isEmpty(wordId.toString())){
continue;
}
if (documentFrequency.get(wordId) == null){
continue;
}
Long freq = documentFrequency.get(wordId);
double tfIdfValue = tfidf.calculate(count, freq.intValue(), 1, documentCount);
vector.setQuick(wordId, tfIdfValue);
}
// 利用贝叶斯算法开始分类,并提取得分最好的分类label
Vector resultVector = classifier.classifyFull(vector);
double bestScore = -Double.MAX_VALUE;
int bestCategoryId = -1;
for(Element element: resultVector.all()) {
int categoryId = element.index();
double score = element.get();
System.out.println("categoryId:"+categoryId+" score:"+score);
if (score > bestScore) {
bestScore = score;
bestCategoryId = categoryId;
}
}
classify = labelIndex.get(bestCategoryId)+"(categoryId="+bestCategoryId+")";
return classify;
}
public static void printResult(){
System.out.println("检测所属类别是:"+getCheckResult());
}
}
-
无相关信息
- 1清新职场环境,招聘优秀保洁服务人员
- 2如何选择适合您的玉泉路搬家公司及联系方式
- 3奉贤商城专业保洁服务标准详解
- 4广西玉林邮政社会招聘面试?
- 5大连搬家服务全解析:选择适合您的搬家公司
- 6丰台区搬家服务推荐及联系方式
- 7快速找到厂洼搬家公司电话,轻松搬家无忧
- 8揭秘万寿路搬家公司电话,快速搬家从此不再烦恼
- 9丰台区搬家攻略:选择最适合你的搬家公司
- 10广州长途搬家服务:如何选择合适的搬家公司及其联系方式
- 11木樨地搬家服务全面指南:如何快速找到合适的搬家公司电话
- 12在宣武门找搬家公司?拨打这些电话轻松搬家!
- 13西二旗搬家服务全攻略:如何找到最靠谱的搬家公司电话
- 14临河搬家公司:专业服务助您轻松搬家
- 15选择异地搬家的物流公司,这几点必须考虑!
- 16可靠的十八里店搬家公司:联系电话与服务指南
- 17创造搬家新体验:大众搬家公司官网全揭秘
- 18武汉优质店铺保洁服务,让您的店面焕然一新
- 19青浦物业小区保洁服务的魅力与选择
- 20如何选择合适的保洁服务?美女保洁员的优势与魅力
- 21提升家政服务水平:普陀家政保洁服务培训解析
- 22提升均安诊所环境的保洁服务方案:健康与舒适双重保障
- 23银行保洁服务方案的标准与最佳实践
- 24提升清洁技能,在西藏寻找最佳保洁服务培训学校
- 25让生活更轻松:大型蓝领公寓保洁服务的优势与选择
- 26义乌商贸区保洁服务:为您的商务环境保驾护航
- 27全面解析绿色保洁服务的多种方法
- 28打造洁净空间:武昌大楼日常保洁服务全解析
- 29无锡到上海搬家公司电话推荐:靠谱服务一键直达
- 30辽宁2023年省考公告时间?