Hadoop

Cloudera Hadoop

Install

rpm -ivh cdh3-repository-1.0-1.noarch.rpm
 
yum install hadoop-0.20-conf-pseudo hadoop-0.20-native
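
The conf-pseudo package sets up a single-node pseudo-distributed configuration. A quick way to check that the install worked (a sketch; service and user names follow the CDH3 packaging, adjust if yours differ):

for service in /etc/init.d/hadoop-0.20-*; do $service start; done   # start all local daemons
sudo -u hdfs hadoop fs -ls /                                         # HDFS should answer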

Config files

core-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 
<configuration>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://namenode.mobcon.inside:8020</value>
  </property>
 
  <property>
     <name>hadoop.tmp.dir</name>
     <value>/var/lib/hadoop-0.20/cache/${user.name}</value>
  </property>
 
  <!-- OOZIE proxy user setting
  <property>
    <name>hadoop.proxyuser.oozie.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.oozie.groups</name>
    <value>*</value>
  </property>
  -->
</configuration>
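
With fs.default.name pointing at the namenode, the filesystem has to be formatted once before the first start. A minimal check that the setting is picked up (hostname taken from this config; format only on a fresh cluster):

sudo -u hdfs hadoop namenode -format                  # one-time, destroys existing metadata
hadoop fs -ls hdfs://namenode.mobcon.inside:8020/     # should list the HDFS root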

hdfs-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
  <property>
     <name>dfs.permissions</name>
     <value>false</value>
  </property>
     <!-- specify this so that running 'hadoop namenode -format' formats the right dir
  <property>
     <name>dfs.name.dir</name>
     <value>/var/lib/hadoop-0.20/cache/hadoop/dfs/name</value>
  </property>
          -->
 
  <!-- Enable Hue Plugins
  <property>
    <name>dfs.namenode.plugins</name>
    <value>org.apache.hadoop.thriftfs.NamenodePlugin</value>
    <description>Comma-separated list of namenode plug-ins to be activated.
    </description>
  </property>
  <property>
    <name>dfs.datanode.plugins</name>
    <value>org.apache.hadoop.thriftfs.DatanodePlugin</value>
    <description>Comma-separated list of datanode plug-ins to be activated.
    </description>
  </property>
  <property>
    <name>dfs.thrift.address</name>
    <value>0.0.0.0:10090</value>
  </property>
       -->
</configuration>
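
Once the datanodes have joined, whether the replication factor of 3 is actually being met can be checked from the namenode (sketch):

hadoop dfsadmin -report          # live datanodes, configured/used capacity
hadoop fsck / -files -blocks     # per-file block count and replication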

mapred-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 
<configuration>
  <property>
    <name>mapred.job.tracker</name>
    <value>namenode.mobcon.inside:8021</value>
  </property>
 
  <!-- Enable Hue plugins
  <property>
    <name>mapred.jobtracker.plugins</name>
    <value>org.apache.hadoop.thriftfs.ThriftJobTrackerPlugin</value>
    <description>Comma-separated list of jobtracker plug-ins to be activated.
    </description>
  </property>
  <property>
    <name>jobtracker.thrift.address</name>
    <value>0.0.0.0:9290</value>
  </property>
   -->
</configuration>
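
A quick check that the jobtracker is listening on 8021 and accepting clients (sketch):

netstat -tlnp | grep 8021    # jobtracker RPC port from mapred.job.tracker
hadoop job -list             # should print an empty job list, not a connection error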

master

namenode.mobcon.inside

slaves

dark94025.mobcon.inside
dark94026.mobcon.inside
dark94027.mobcon.inside
dark94028.mobcon.inside
dark94029.mobcon.inside
dark94030.mobcon.inside
dark94031.mobcon.inside
dark94032.mobcon.inside
dark94033.mobcon.inside
dark94034.mobcon.inside
dark94035.mobcon.inside
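
Every node needs the same config, so a simple push loop over this slaves file saves retyping (a sketch; paths assume the conf layout used later on this page):

for h in $(cat /etc/hadoop-0.20/conf/slaves); do
  scp /etc/hadoop-0.20/conf/*.xml $h:/etc/hadoop-0.20/conf/
done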

Start

service hadoop-0.20-namenode start
service hadoop-0.20-datanode start
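
Whether the daemons actually came up can be verified with jps or the service logs (log path as used by the CDH3 packages):

jps                                     # should list NameNode and DataNode
tail -n 50 /var/log/hadoop-0.20/*.log   # look for bind/format errors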

Test

su - hdfs
hadoop jar /usr/lib/hadoop/hadoop-examples.jar pi 4 2000
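
Besides the pi job, a plain HDFS round trip is a useful smoke test (still as the hdfs user; the smoke directory is just a throwaway name):

hadoop fs -mkdir smoke
hadoop fs -put /etc/hosts smoke/
hadoop fs -cat smoke/hosts
hadoop fs -rmr smoke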

Basic installation procedure

A quick installation run-through. If you are new to Hadoop, do not use this page as a tutorial; these are the command notes from my own work and will not help beginners.
rpm -Uvh http://download.fedora.redhat.com/pub/epel/5/x86_64/epel-release-5-3.noarch.rpm
rpm -Uvh http://apt.sw.be/redhat/el5/en/x86_64/RPMS.dag/rpmforge-release-0.3.6-1.el5.rf.x86_64.rpm
rm -rf /etc/munin/plugins/sendmail_mail*
rm -rf /etc/munin/plugins/netstat
rm -rf /etc/munin/plugins/interrupts
rm -rf /etc/munin/plugins/irqstats
rm -rf /etc/munin/plugins/entropy
rm -rf /etc/munin/plugins/nfs_client
rm -rf /etc/munin/plugins/ntp_offset
sed -i '37s/.*/allow ^10\\\.40\\\.0\\\.2$/' /etc/munin/munin-node.conf
service munin-node restart
 
yum -y install nagios-plugins-nrpe.x86_64 nagios-nrpe.x86_64
yum -y install nagios-plugins-*
 
yum -y install nrpe.x86_64
sed -i '79s/.*/allowed_hosts=127\.0\.0\.1,10\.40\.0\.2/' /etc/nagios/nrpe.cfg
sed -i '204s/.*/command[check_ssh]=\/usr\/lib64\/nagios\/plugins\/check_ssh -H 127\.0\.0\.1/' /etc/nagios/nrpe.cfg
service nrpe start
 
yum -y install ncftp
 
sed -i '77s/.*/yepn        ALL=(ALL)       ALL/' /etc/sudoers
rpm -Uvh /tmp/jdk-6u17-linux-amd64.rpm
 
yum -y install compat-libstdc++-33.i386 compat-libstdc++-296.i386
wget http://ftp.riken.jp/net/apache/ant/binaries/apache-ant-1.8.0RC1-bin.tar.gz
tar xvzf apache-ant-1.8.0RC1-bin.tar.gz
mv apache-ant-1.8.0RC1 /usr/local/
ln -s /usr/local/apache-ant-1.8.0RC1 /usr/local/ant
wget http://www.oberhumer.com/opensource/lzo/download/lzo-2.03.tar.gz
tar xvzf lzo-2.03.tar.gz && cd lzo-2.03 && ./configure --enable-shared && make && make install
mv /usr/bin/ant /usr/bin/ant.org
ln -s /usr/local/ant/bin/ant /usr/bin/ant
cd ~/
 
wget http://archive.cloudera.com/redhat/cdh/cloudera-testing.repo
mv cloudera-testing.repo /etc/yum.repos.d/
yum -y install hadoop-0.20 hadoop-0.20-native hadoop-0.20-namenode hadoop-0.20-secondarynamenode \
    hadoop-0.20-datanode hadoop-0.20-jobtracker hadoop-0.20-tasktracker hadoop-0.20-docs
 
chkconfig hadoop-0.20-jobtracker off
chkconfig hadoop-0.20-namenode off
chkconfig hadoop-0.20-secondarynamenode off
for i in `chkconfig  --list|grep hbase|awk '{print $1}'`; do chkconfig $i off; done
 
echo "# /home/yepn/.bash_profile" > /home/yepn/.bash_profile
echo "" >> /home/yepn/.bash_profile
echo "# Get the aliases and functions" >> /home/yepn/.bash_profile
echo "if [ -f ~/.bashrc ]; then" >> /home/yepn/.bash_profile
echo "        . ~/.bashrc" >> /home/yepn/.bash_profile
echo "fi" >> /home/yepn/.bash_profile
echo "" >> /home/yepn/.bash_profile
echo "# User specific environment and startup programs" >> /home/yepn/.bash_profile
echo "export ANT_HOME=/usr/local/ant" >> /home/yepn/.bash_profile
echo "PATH=/usr/java/bin:$PATH:$HOME/bin:$ANT_HOME/bin" >> /home/yepn/.bash_profile
echo "LD_LIBRARY_PATH=/usr/local/lib" >> /home/yepn/.bash_profile
echo "JAVA_HOME=/usr/java/latest" >> /home/yepn/.bash_profile
echo "" >> /home/yepn/.bash_profile
echo "export LD_LIBRARY_PATH" >> /home/yepn/.bash_profile
echo "export JAVA_HOME" >> /home/yepn/.bash_profile
echo "export PATH" >> /home/yepn/.bash_profile
 
su - yepn
git clone git://github.com/kevinweil/hadoop-lzo.git
cd hadoop-lzo
ant compile-native tar
 
exit
cd /home/yepn/hadoop-lzo
cp build/hadoop-lzo-0.3.0/hadoop-lzo-0.3.0.jar /usr/lib/hadoop-0.20/lib
tar -cBf - -C build/hadoop-lzo-0.3.0/lib/native . | tar -xBvf - -C /usr/lib/hadoop-0.20/lib/native
 
 
/etc/init.d/hadoop-0.20-datanode start
/etc/init.d/hadoop-0.20-tasktracker start
service munin-node restart
 
 
# Run at the end
 
mkdir /hdfs/data
mkdir /hdfs/local
mkdir /hdfs/logs
mkdir /hdfs/name
mkdir /hdfs/pids
mkdir /hdfs/system
mkdir /hdfs/tmp
 
chmod -R 777 /hdfs
 
cd /etc/hadoop-0.20/
tar -zxvf /tmp/conf.yepn.tgz
rm -rf /etc/alternatives/hadoop-0.20-conf
ln -s /etc/hadoop-0.20/conf.yepn /etc/alternatives/hadoop-0.20-conf
/etc/init.d/hadoop-0.20-datanode restart
/etc/init.d/hadoop-0.20-tasktracker restart
 
hadoop dfs -ls
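
Deleting and recreating the symlink by hand works, but CDH manages that link through the alternatives system; the equivalent (and cleaner) way is roughly:

alternatives --install /etc/hadoop-0.20/conf hadoop-0.20-conf /etc/hadoop-0.20/conf.yepn 50
alternatives --display hadoop-0.20-conf    # conf.yepn should be the current choice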

Library installation (hadoop-lzo)

git clone git://github.com/kevinweil/hadoop-lzo.git
cd hadoop-lzo
ant compile-native tar
su
 
cp build/hadoop-lzo-0.3.0/hadoop-lzo-0.3.0.jar /usr/lib/hadoop-0.20/lib
tar -cBf - -C build/hadoop-lzo-0.3.0/lib/native . | tar -xBvf - -C /usr/lib/hadoop-0.20/lib/native
 
/etc/init.d/hadoop-0.20-datanode start
/etc/init.d/hadoop-0.20-tasktracker start

MapReduce Python scripts

#mapper.py
#!/usr/bin/env python
 
import sys
 
# input comes from STDIN (standard input)
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()
    # split the line into words
    words = line.split()
    # increase counters
    for word in words:
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        #
        # tab-delimited; the trivial word count is 1
        print '%s\t%s' % (word, 1)
#reducer.py
#!/usr/bin/env python
 
from operator import itemgetter
import sys
 
# maps words to their counts
word2count = {}
 
# input comes from STDIN
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()
 
    # parse the input we got from mapper.py
    word, count = line.split('\t', 1)
    # convert count (currently a string) to int
    try:
        count = int(count)
        word2count[word] = word2count.get(word, 0) + count
    except ValueError:
        # count was not a number, so silently
        # ignore/discard this line
        pass
 
# sort the words lexicographically;
#
# this step is NOT required, we just do it so that our
# final output will look more like the official Hadoop
# word count examples
sorted_word2count = sorted(word2count.items(), key=itemgetter(0))
 
# write the results to STDOUT (standard output)
for word, count in sorted_word2count:
    print '%s\t%s'% (word, count)
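
Before submitting to the cluster, the pair can be tested locally with plain Unix pipes, which mimics what Hadoop Streaming does (sort stands in for the shuffle phase):

chmod +x mapper.py reducer.py
echo "foo foo quux labs foo bar quux" | ./mapper.py | sort | ./reducer.py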

Parallel computation

#hadoop jar /usr/lib/hadoop-0.20/contrib/streaming/hadoop-0.20.1+152-streaming.jar -file ./mapper.py \
-mapper ./mapper.py  -file ./reducer.py  -reducer ./reducer.py -input gutenberg/* \
-output getenberg-output
### Output
#hadoop dfs -ls getenberg-output
Found 6 items
drwxr-xr-x   - atlantis supergroup          0 2010-02-02 15:41 /user/atlantis/getenberg-output/_logs
-rw-r--r--   1 atlantis supergroup      66618 2010-02-02 15:41 /user/atlantis/getenberg-output/part-00000
-rw-r--r--   1 atlantis supergroup      68868 2010-02-02 15:41 /user/atlantis/getenberg-output/part-00001
-rw-r--r--   1 atlantis supergroup      66862 2010-02-02 15:41 /user/atlantis/getenberg-output/part-00002
-rw-r--r--   1 atlantis supergroup      68264 2010-02-02 15:41 /user/atlantis/getenberg-output/part-00003
-rw-r--r--   1 atlantis supergroup      66828 2010-02-02 15:41 /user/atlantis/getenberg-output/part-00004
 
### Viewing the results
#hadoop dfs -cat getenberg-output/part-00000
### (output omitted)
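
To pull all the part files back into a single local file, getmerge is handy (sketch):

hadoop dfs -getmerge getenberg-output ./getenberg-wordcount.txt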