Saturday, 7 July 2018

Create AWS Infrastructure With Terraform | VPC | Public/Private Subnets | Internet-gateway | Security-groups | EC2-Instances

Setup VPC, Public/Private Subnets, Internet Gateway, Route Table, Security Groups, EC2 Instance With Terraform 

How To Use Terraform To Create AWS Infrastructure

Terraform Tutorial To Launch VPC Subnets EC2 Instances

# Introduction To Terraform:

Terraform lets you write, plan, and create infrastructure as code. We have already covered the basics of Terraform in our previous tutorial (blog & video tutorial). We recommend going through it once before continuing with this tutorial.

# End Goal

Using Terraform, we will create in minutes a complete infrastructure that can run your whole application. Following are the components we will be creating:
  • VPC (Virtual Private Cloud)
  • Public subnet (for webserver) & Private subnet (for database server)
  • Internet Gateway
  • Route Table
  • Security Groups (for webserver and database server)
  • Key Pair
  • EC2 Instance (For webserver and database)

# Video Tutorial


# Prerequisites

  • AWS Account
  • Basic understanding or knowledge of AWS related terminology
  • IAM user access key and secret key (IAM user should have permission to create required resources).

# Create public key

  • Generate the public key from the private key and save it to the file referenced by var.key_path (public_key):
    ssh-keygen -y -f ~/.ssh/pemfile/mumbai.pem > public_key
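
The provider block below only sets the region, so Terraform picks up the IAM user credentials from its usual sources. One simple option (a minimal sketch; the values are placeholders for your own keys) is to export them as environment variables before running Terraform:

export AWS_ACCESS_KEY_ID="<access-key-id>"
export AWS_SECRET_ACCESS_KEY="<secret-access-key>"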

# File to store global variables (variables.tf)

variable "aws_region" {
  description = "Region for the VPC"
  default = "ap-southeast-1"
}

variable "vpc_cidr" {
  description = "CIDR for the VPC"
  default = "10.0.0.0/16"
}

variable "public_subnet_cidr" {
  description = "CIDR for the public subnet"
  default = "10.0.1.0/24"
}

variable "private_subnet_cidr" {
  description = "CIDR for the private subnet"
  default = "10.0.2.0/24"
}

variable "ami" {
  description = "Amazon Linux AMI"
  default = "ami-14c5486b"
}

variable "key_path" {
  description = "SSH Public Key path"
  default = "public_key"
}

# Set provider to AWS (provider.tf)

# Define AWS as our provider
provider "aws" {
  region = "${var.aws_region}"
}

# Creating VPC, Subnets, Internet Gateway, Route Table, Security Groups (vpc.tf)

# Define our VPC
resource "aws_vpc" "default" {
  cidr_block = "${var.vpc_cidr}"
  enable_dns_hostnames = true

  tags {
    Name = "test-vpc"
  }
}

# Define the public subnet
resource "aws_subnet" "public-subnet" {
  vpc_id = "${aws_vpc.default.id}"
  cidr_block = "${var.public_subnet_cidr}"
  availability_zone = "ap-southeast-1"

  tags {
    Name = "Web Public Subnet"
  }
}

# Define the private subnet
resource "aws_subnet" "private-subnet" {
  vpc_id = "${aws_vpc.default.id}"
  cidr_block = "${var.private_subnet_cidr}"
  availability_zone = "us-east-1b"

  tags {
    Name = "Database Private Subnet"
  }
}

# Define the internet gateway
resource "aws_internet_gateway" "gw" {
  vpc_id = "${aws_vpc.default.id}"

  tags {
    Name = "VPC IGW"
  }
}

# Define the route table
resource "aws_route_table" "web-public-rt" {
  vpc_id = "${aws_vpc.default.id}"

  route {
    cidr_block = "0.0.0.0/0"
    gateway_id = "${aws_internet_gateway.gw.id}"
  }

  tags {
    Name = "Public Subnet RT"
  }
}

# Assign the route table to the public Subnet
resource "aws_route_table_association" "web-public-rt" {
  subnet_id = "${aws_subnet.public-subnet.id}"
  route_table_id = "${aws_route_table.web-public-rt.id}"
}

# Define the security group for public subnet
resource "aws_security_group" "sgweb" {
  name = "vpc_test_web"
  description = "Allow incoming HTTP connections & SSH access"

  ingress {
    from_port = 80
    to_port = 80
    protocol = "tcp"
    cidr_blocks = ["0.0.0.0/0"]
  }

  ingress {
    from_port = 443
    to_port = 443
    protocol = "tcp"
    cidr_blocks = ["0.0.0.0/0"]
  }

  ingress {
    from_port = -1
    to_port = -1
    protocol = "icmp"
    cidr_blocks = ["0.0.0.0/0"]
  }

  ingress {
    from_port = 22
    to_port = 22
    protocol = "tcp"
    cidr_blocks =  ["0.0.0.0/0"]
  }

  egress {
    from_port       = 0
    to_port         = 0
    protocol        = "-1"
    cidr_blocks     = ["0.0.0.0/0"]
  }

  vpc_id="${aws_vpc.default.id}"

  tags {
    Name = "Web Server SG"
  }
}

# Define the security group for private subnet
resource "aws_security_group" "sgdb"{
  name = "sg_test_web"
  description = "Allow traffic from public subnet"

  ingress {
    from_port = 3306
    to_port = 3306
    protocol = "tcp"
    cidr_blocks = ["${var.public_subnet_cidr}"]
  }

  ingress {
    from_port = -1
    to_port = -1
    protocol = "icmp"
    cidr_blocks = ["${var.public_subnet_cidr}"]
  }

  ingress {
    from_port = 22
    to_port = 22
    protocol = "tcp"
    cidr_blocks = ["${var.public_subnet_cidr}"]
  }

  vpc_id = "${aws_vpc.default.id}"

  tags {
    Name = "DB SG"
  }
}

# Creating Key-pair, Instances (resources.tf)

# Define SSH key pair for our instances
resource "aws_key_pair" "default" {
  key_name = "mumbai"
  public_key = "${file("${var.key_path}")}"
}

# Define webserver inside the public subnet
resource "aws_instance" "wb" {
   ami  = "${var.ami}"
   instance_type = "t1.micro"
   key_name = "${aws_key_pair.default.id}"
   subnet_id = "${aws_subnet.public-subnet.id}"
   vpc_security_group_ids = ["${aws_security_group.sgweb.id}"]
   associate_public_ip_address = true
   source_dest_check = false
   user_data = "${file("userdata.sh")}"

  tags {
    Name = "webserver"
  }
}

# Define database inside the private subnet
resource "aws_instance" "db" {
   ami  = "${var.ami}"
   instance_type = "t1.micro"
   key_name = "${aws_key_pair.default.id}"
   subnet_id = "${aws_subnet.private-subnet.id}"
   vpc_security_group_ids = ["${aws_security_group.sgdb.id}"]
   source_dest_check = false

  tags {
    Name = "database"
  }
}

# Userdata for webserver instance (userdata.sh)


#!/bin/sh

set -x
# output log of userdata to /var/log/user-data.log
exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1
yum install -y httpd
service httpd start
chkconfig httpd on
echo "<html><h1>Hello from Appychip</h1></html>" > /var/www/html/index.html

Finally, after creating these files in a folder named "terraform-example", run "terraform plan" inside the terraform-example directory to see which resources would be created; it only previews the changes without actually creating anything. Run "terraform apply" to actually create these resources, as shown below.
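
A typical command sequence looks like the following (a minimal sketch; terraform init is needed once per working directory to download the AWS provider plugin):

cd terraform-example
terraform init      # download provider plugins (run once per directory)
terraform plan      # preview the resources that would be created
terraform apply     # create the resources
terraform destroy   # tear everything down when no longer needed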

Sunday, 27 May 2018

Automated EBS Volume Snapshot Using AWS Lambda Based On EC2 Tags

Lambda Function To Take Snapshot Of EBS Volume Based On EC2 Tags

Take Snapshot Of Secondary Volumes Of Instance Having particular Tag

Create AWS Lambda Function For EBS Snapshot Having Certain Tags On EC2 Instance

It is often required to take snapshots of EBS volumes for backup, as there could be important data residing on those volumes. This can be achieved by running a script on an EC2 instance as a cron job, but that is unreliable: the script may fail for any number of reasons and we won't come to know about it.


To solve this problem we will use a Python script on AWS Lambda, which can be scheduled to run at the required interval. You can also set a CloudWatch alarm to trigger a mail if the Lambda function returns an error.

Note: We are going to take snapshots of only the secondary volumes, assuming that for most databases the data resides on a secondary disk instead of the root volume.

# End Goal

  • Take snapshots of only the secondary volumes
  • Snapshots should be taken only of volumes attached to instances having the tag "Backup:true"

# Prerequisites

  • IAM role (volume-snapshot-role) with a policy allowing it to read EC2 instance details and create snapshots.
  • An EC2 instance with a secondary volume attached.

# Video Tutorial


# IAM Role

The IAM role should have the following policies attached to it:
  1.  AWS Managed Policy: AmazonEC2ReadOnlyAccess
  2. volume-snapshot-policy:
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "logs:CreateLogGroup",
                "logs:CreateLogStream",
                "logs:PutLogEvents"
            ],
            "Resource": "arn:aws:logs:*:*:*"
        },
        {
            "Effect": "Allow",
            "Action": [
                "ec2:DescribeVolumeStatus",
                "ec2:DescribeVolumes",
                "ec2:DescribeSnapshots",
                "ec2:CreateSnapshot",
                "ec2:CreateTags"
            ],
            "Resource": "*"
        }
    ]
}
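
If you prefer the CLI over the console, the role can be created roughly as follows (a sketch; the trust policy file and names are assumptions, and volume-snapshot-policy.json is the inline policy JSON shown above saved to a file):

# Trust policy letting Lambda assume the role
cat > lambda-trust.json <<'EOF'
{
  "Version": "2012-10-17",
  "Statement": [{
    "Effect": "Allow",
    "Principal": {"Service": "lambda.amazonaws.com"},
    "Action": "sts:AssumeRole"
  }]
}
EOF

aws iam create-role --role-name volume-snapshot-role \
    --assume-role-policy-document file://lambda-trust.json

# Attach the AWS managed read-only policy and the inline snapshot policy
aws iam attach-role-policy --role-name volume-snapshot-role \
    --policy-arn arn:aws:iam::aws:policy/AmazonEC2ReadOnlyAccess
aws iam put-role-policy --role-name volume-snapshot-role \
    --policy-name volume-snapshot-policy \
    --policy-document file://volume-snapshot-policy.json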

# Lambda Function

Following is the lambda function snippet:


from __future__ import print_function
import boto3
from boto3 import resource

'''
    This script takes snapshot of volumes which are secondary volumes and are attached to instance having "Backup:true" tag
'''

SNAPSHOT_TAGS = {'createdby': 'lambda'}        # Dictionary of tags to apply to the created snapshots
TAG_FILTERS = [{'Name': 'tag:Backup','Values': ['true']}]   # Tags used to filter the EC2 instances
REGION = "ap-south-1"                                       # AWS region in which the volumes exist


def take_snapshots(volume, tags_kwargs):
    snapshot = volume.create_snapshot(
           Description='created by lambda'
           )
    #snapshot = ec2.create_snapshot(VolumeId=volume,Description='Created by Lambda function ebs-snapshots')
    print(snapshot)
    if tags_kwargs:
        snapshot.create_tags(**tags_kwargs)

def process_tags():
    tags = []
    tags_kwargs = {}
    # AWS allows 10 tags per resource
    if SNAPSHOT_TAGS and len(SNAPSHOT_TAGS) <= 10:
        for key, value in SNAPSHOT_TAGS.items():
            tags.append({'Key': key, 'Value': value})
        tags_kwargs['Tags'] = tags
    return tags_kwargs

def print_summary(counts):
    print("\nSUMMARY:\n")
    print("Snapshots created:  {}{}".format(counts,""))
    print("-------------------------------------------\n")


def lambda_handler(event, context):

    snap_count = 0

    # List of devices that should be considered for snapshots. The root volume (/dev/xvda or /dev/sda) is excluded here.
    DEVICES = ['/dev/sdb', '/dev/sdc', '/dev/sde', '/dev/sdf', '/dev/sdg', '/dev/sdh', '/dev/sdi', '/dev/sdj', '/dev/sdk', '/dev/sdl', '/dev/sdm', '/dev/sdn', '/dev/sdo', '/dev/sdp', '/dev/sdq', '/dev/sdr', '/dev/sds', '/dev/sdt', '/dev/sdu', '/dev/sdv', '/dev/sdw', '/dev/sdx', '/dev/sdy', '/dev/sdz']

    ec2 = resource("ec2", region_name=REGION)
    ec2_client = boto3.client('ec2')

    # Get information for all instances with tag "Backup:true"
    instances = ec2.instances.filter(Filters=TAG_FILTERS)

    # For each matching instance, find the volumes attached on the allowed secondary device names
    for instance in instances:
        ec2_instance = ec2.Instance(instance.id)
        print("for instance:", ec2_instance )
        volumes = ec2.volumes.filter(Filters=[{'Name': 'attachment.instance-id', 'Values': [ec2_instance.id]}, {'Name': 'attachment.device', 'Values': DEVICES} ])
        #print(volumes)
  
        for tags in instance.tags:
            #print("inside for loop of instance.tags")
            if tags["Key"] == 'Name':
                SNAPSHOT_TAGS['Name'] = tags["Value"]
  
        # Take Backup
        for volume in volumes:
            tags_kwargs = process_tags()

            print("Taking snapshot..")
            take_snapshots(ec2.Volume(volume.id), tags_kwargs)
            snap_count += 1
            
    print("at last summary")
    print_summary(snap_count)
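
The post relies on the function being scheduled at the required interval; one way to do that is with a CloudWatch Events rule. A sketch with placeholder names, ARNs and account ID:

# Create a rule that fires once a day
aws events put-rule --name ebs-snapshot-schedule \
    --schedule-expression "rate(1 day)"

# Allow CloudWatch Events to invoke the Lambda function (ARNs are placeholders)
aws lambda add-permission --function-name ebs-volume-snapshot \
    --statement-id ebs-snapshot-schedule \
    --action lambda:InvokeFunction \
    --principal events.amazonaws.com \
    --source-arn arn:aws:events:ap-south-1:123456789012:rule/ebs-snapshot-schedule

# Point the rule at the function
aws events put-targets --rule ebs-snapshot-schedule \
    --targets "Id"="1","Arn"="arn:aws:lambda:ap-south-1:123456789012:function:ebs-volume-snapshot"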

Saturday, 12 May 2018

Setup Kubernetes Cluster locally | Minikube

Kubernetes Cluster for Local Development with Minikube

Minikube Installation

Deploy Nginx Webserver In Kubernetes Cluster Setup Via Minikube

From Official Documentation:

"Minikube is a tool that makes it easy to run Kubernetes locally. Minikube runs a single-node Kubernetes cluster inside a VM on your laptop for users looking to try out Kubernetes or develop with it day-to-day."

# Minikube Cluster supports

  • Dashboard to manage cluster.
  • ConfigMaps to store configurations.
  • Secrets to store secrets related to your application.
  • NodePorts
  • Various container runtimes like Docker, rkt, etc.
  • Container Network Interface (CNI)
  • Ingress

# End Goal

  • Minikube installed.
  • A little hands-on with Minikube commands
  • Deploy Nginx webserver in kubernetes cluster formed by minikube

# Prerequisites

  • A hypervisor installed. For Linux: VirtualBox or KVM; for macOS: VirtualBox or HyperKit.
  • Kubectl installed.

# Installation Steps

  1.  For Linux:

    curl -Lo minikube https://storage.googleapis.com/minikube/releases/v0.26.1/minikube-linux-amd64 && chmod +x minikube && sudo mv minikube /usr/local/bin/
    

  2. For OSX:

    curl -Lo minikube https://storage.googleapis.com/minikube/releases/v0.26.1/minikube-darwin-amd64 && chmod +x minikube && sudo mv minikube /usr/local/bin/
    
Minikube is ready to use.
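
Before starting a cluster, you can quickly confirm both binaries are in place (assuming kubectl is already installed as per the prerequisites):

minikube version
kubectl version --client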

# Minikube Useful Commands

1. To start Minikube Cluster

$ minikube start
Starting local Kubernetes cluster...
Running pre-create checks...
Creating machine...
Starting local Kubernetes cluster...

2. To stop Minikube Cluster

$ minikube stop
Stopping local Kubernetes cluster...
Stopping "minikube"...

3. To check Status

$ minikube status
minikube: Running
cluster: Running
kubectl: Correctly Configured: pointing to minikube-vm at 192.168.99.100

4. To launch dashboard

$ minikube dashboard
Opening kubernetes dashboard in default browser...

# Let's try deploying the Nginx webserver on the Minikube cluster

1. Create deployment for nginx server

$ kubectl run nginx --image=nginx:latest --port=80
deployment.apps "nginx" created

2. Create service to expose the nginx web server

$ kubectl expose deployment nginx --type=NodePort
service "nginx" exposed
3. Check Pod status

$ kubectl get pods
NAME                     READY     STATUS    RESTARTS   AGE
nginx-7dc954dcb5-6qqcz   1/1       Running   0          18s

4. Get endpoint and port on which nginx service is running, so that it can be accessed.

$ minikube service nginx --url
http://192.168.99.100:32688

Open the above URL in a browser and you will see the Nginx webserver up and running.
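
When you are done experimenting, the demo resources can be cleaned up and the cluster stopped:

kubectl delete service nginx
kubectl delete deployment nginx
minikube stop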

Setup Cronjobs in Kubernetes Cluster

K8s | Cron Jobs


Cron jobs are tasks that run periodically. You can run any script as a cron job that needs to run from time to time, without any manual intervention.

In Linux, cron comes as a utility that you can use to schedule jobs. Here we will achieve the same goal without depending on any virtual machine or its operating system: we will run the cron job in a Kubernetes cluster.

# Advantages:

  • No need to run a dedicated VM or service for scheduling jobs.
  • High availability of the job is taken care of by Kubernetes.
  • Easy, one-time setup.
  • Portability

Every time the cron job runs, Kubernetes creates a pod for it. The pod can be scheduled on any worker node of the cluster, depending on resource availability, and the pod name will always be different from your cron job name.
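
Once the schedule has fired at least once, you can see the job objects and pods it creates and read their output; a quick sketch (the pod name suffix will differ in your cluster):

# Jobs created by the cron schedule (a new one roughly every minute here)
kubectl get jobs --watch

# Pods backing those jobs, and the output of one of them
kubectl get pods
kubectl logs <hello-job-pod-name>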

# Prerequisites:

  • Kubernetes cluster with version >=1.8

# Process to setup:

Cron jobs can be set up using a YAML config file or the kubectl run command-line utility.

1. Using kubectl run command line utility

kubectl run hello --schedule="*/1 * * * *" --restart=OnFailure --image=busybox -- /bin/sh -c "date; echo Welcome to the Kubernetes Cron Jobs"

Parameters:
  • "hello" is the cron job name.
  • --schedule defines the frequency at which the cron job should run.
  • --restart tells Kubernetes under which conditions to restart the job automatically. In the above example, the job will restart if its execution fails.
  • --image specifies the Docker image to be used.
  • The last arguments are passed to the container as the command to run.

2. Using YML Configuration file

Below is an example YAML file; save it as cronjob.yaml:
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: hello
spec:
  schedule: "*/1 * * * *"
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: hello
            image: busybox
            args:
            - /bin/sh
            - -c
            - date; echo Welcome to the Kubernetes cluster
          restartPolicy: OnFailure

Create cron job using above yml file:

kubectl create -f ./cronjob.yaml

To view status of cron jobs:

kubectl get cronjob hello

where, "hello" is a cronjob name.

To delete Cron job:

kubectl delete cronjob hello


Saturday, 5 May 2018

Kibana Error 403 Forbidden: blocked by: [FORBIDDEN/12/index read-only / allow delete (api)];

flood stage disk watermark [95%] exceeded, all indices on this node will marked read-only

You may face this error if your disk usage has crossed the flood-stage watermark level, at which point Elasticsearch marks all indices read-only. Once the indices get locked, they do not get unlocked automatically even after you free up some disk space. The fix has to be applied manually by running the following command:


curl -XPUT -H "Content-Type: application/json" localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'
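
After clearing the block, it helps to confirm disk headroom and, if needed, adjust the flood-stage watermark. A sketch (the 97% value is just an example; the setting applies to Elasticsearch 6.x):

# Check disk usage per node
curl -s 'localhost:9200/_cat/allocation?v'

# Optionally raise the flood-stage watermark (transient, reverts on full cluster restart)
curl -XPUT -H 'Content-Type: application/json' localhost:9200/_cluster/settings -d '
{
  "transient": {
    "cluster.routing.allocation.disk.watermark.flood_stage": "97%"
  }
}'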

Debugging Elasticsearch Cluster Going RED Or Yellow

How To Get Elasticsearch Cluster Back To Green From RED Or Yellow


Once our Elasticsearch cluster went RED. Everything stopped working and it was a panic situation; we didn't know what went wrong, and it was late at night. We started debugging and followed some steps to work out what had happened. In this post we will explain the steps we took to recover the cluster, to make you familiar with Elasticsearch debugging. This process is not universal enough to debug every sort of ES issue, but it can surely give you an idea.


Following are the steps we tried to debug and get the cluster back to normal:

First, analyse the cluster health with the following command:

curl localhost:9200/_cat/health?v

This tells you whether the cluster is red/yellow/green, how many unassigned/initialising/relocating shards there are, and whether there are any pending tasks.
In our case the cluster health was RED and there were 2000 unassigned and some initialising shards which were not reducing over time, which essentially means the cluster was not recovering at all.
The reasons for unassigned shards can differ: not enough disk space, too few resources, heavy bulk requests, etc. The aim now was to find out why there were unassigned shards and to reduce them.
Low disk space can result in unassigned shards, so check the overall cluster disk usage. It was hovering around 60% in our case, which means disk space was not the issue.


Also check node-level disk space, as one or more nodes might be at the 85% level, which is the default per-node watermark at which ES stops allocating shards to that node. You can check node-level disk utilisation with the following command:

curl localhost:9200/_cat/allocation?v


Disk utilisation on each node was OK.
Now check the master node logs:
Tail the current master node's logs. If you don't know which node is the master, run the following command; the node marked with an asterisk (*) is the currently elected master:

curl localhost:9200/_cat/nodes?v

In the master node logs, the following errors/exceptions were noted:
master node logs
[2018-04-02 07:08:10,088][DEBUG][action.admin.indices.create] [ip-192-168-6-66] [myindex-2030-11-29] failed to create
org.elasticsearch.cluster.metadata.ProcessClusterEventTimeoutException: failed to process cluster event (create-index [myindex-2030-11-29], cause [auto(bulk api)]) within 1m
        at org.elasticsearch.cluster.service.InternalClusterService$2$1.run(InternalClusterService.java:278)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)
This clearly tells us that the cluster is failing to create indexes within the fixed duration and that the index creation is driven by bulk API requests. Also, the indexes being created are future dated, which is not desired.
The next thing was to look at which indexes had gone bad, making the whole cluster RED. Run the following command to find them:

curl -XGET 192.168.6.106:9200/_cat/shards | grep UNASSIGNED

or, to also see the reason for the unassigned shards, run:

curl -XGET 'localhost:9200/_cat/shards?h=index,shard,prirep,state,unassigned.reason' | grep UNASSIGNED

One common reason for unassigned shards is "NODE LEFT". The master node may consider a node out of the cluster if it doesn't acknowledge its presence within a specific time. A data node may fail to acknowledge for many reasons, e.g. the process got stopped, or stop-the-world GC is happening frequently and taking too much time. These things you can check in that particular data node's logs.
In our case the following indexes were unassigned and red (only a few are shown as an example). They are all future dated:


myindex-2021-06-30 4 r UNASSIGNED
myindex-2021-06-30 4 r UNASSIGNED
myindex-2021-06-30 4 p UNASSIGNED
myindex-2021-06-30 0 r UNASSIGNED
myindex-2021-06-30 0 r UNASSIGNED
myindex-2021-06-30 0 p UNASSIGNED
myindex-2021-06-30 3 r UNASSIGNED
myindex-2021-06-30 3 r UNASSIGNED
myindex-2021-06-30 3 p UNASSIGNED
myindex-2021-06-30 1 r UNASSIGNED
myindex-2021-06-30 1 r UNASSIGNED
We were pretty sure that some client had uploaded a CSV with future-dated events, resulting in heavy index creation.


Now, to recover the state, we stopped the writes to Elasticsearch (since we use Kafka, we could afford to stop writes). If you can't stop the writes, just disable allocation.
After stopping writes, we disabled allocation on the cluster with the following command:

curl -XPUT localhost:9200/_cluster/settings -d '{"transient":{"cluster.routing.allocation.enable":"none"}}'

We also purged the cluster cache to clean up memory and make things work faster:

curl -XPOST localhost:9200/_cache/clear

Now the unassigned shards were moving into the initialising state and the pending tasks were slowly reducing.
An unassigned shard can also be routed to a healthy node, for example when the cluster went yellow because of an unhealthy node. You can use the following command to reroute:



curl -XPOST 'http://localhost:9200/_cluster/reroute' -d '
{
   "commands" : [
      {
          "allocate" : {
              "index" : "myindex-2021-06-30", "shard" : 1,
              "node": "ip-192-168-6-165", "allow_primary": "true"
          }
      }
   ]
}'


You can also just trigger the routing of unassigned shards by the following command without specifying the node:

curl -XPOST localhost:9200/_cluster/reroute?retry_failed

This rerouting was not needed in our case, as we were free to delete the RED unassigned shards: we didn't want future-dated events to be recorded anyway.
To delete the indexes with RED unassigned shards, run the following script:


#!/bin/bash
IFS=$'\n'
# Delete every index that has at least one UNASSIGNED shard.
# WARNING: this deletes the whole index, not just the unassigned shard.
for line in $(curl -s 'localhost:9200/_cat/shards' | fgrep UNASSIGNED); do
  INDEX=$(echo $line | awk '{print $1}')
  SHARD=$(echo $line | awk '{print $2}')
  #echo $INDEX
  curl -XDELETE "localhost:9200/$INDEX"
done
When all unassigned shards were assigned and the pending tasks reached 0, the cluster became green again. Then we re-enabled allocation with the following command:

curl -XPUT 192.168.6.106:9200/_cluster/settings -d '{"transient":{"cluster.routing.allocation.enable":"all"}}'

After this we started the application writes to the Elasticsearch cluster again.
To make sure this doesn't happen again, a validation was added on the application side to stop future-dated events from reaching Elasticsearch.
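
While waiting for the cluster to settle, a few read-only commands we find handy for watching recovery progress (standard _cat/_cluster APIs; nothing here changes cluster state):

# Overall health, shard counts and pending task count
curl 'localhost:9200/_cluster/health?pretty'

# Shard recoveries and their progress
curl 'localhost:9200/_cat/recovery?v'

# Cluster-level tasks waiting on the master
curl 'localhost:9200/_cat/pending_tasks?v'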

Sunday, 8 April 2018

Run Elasticsearch 6.2 Cluster On AWS Spot Fleet

How To Run Elasticsearch 6.2 Cluster In Docker Using Spot Instances In AWS
Dockerising Elasticsearch 6.2

Elasticsearch is a very useful database capable of handling enormous amounts of data. Its capacity and performance are great, but they come at a cost. If you are holding TBs of data in an Elasticsearch cluster, you might end up with around 10 data nodes, 3 master nodes and 1-3 client nodes. This will cost you roughly $7K per month if your data nodes have a configuration of about 30GB RAM each.

What if you can reduce this cost to less than $2K per month? Yes, you can save a lot if you use AWS spot instances.

Here we will set up a cluster with 3 data nodes of 16GB RAM each and a master node with 8GB of RAM. We are keeping the number of nodes limited just for the demo, but you can grow this as per your requirement or use case. So, let's get started...

Steps to be followed:

  • Create security group elasticsearch-sg
  • Create Elasticsearch Config (elasticsearch.yml and jvm.options)
  • Create S3 bucket to hold Elasticsearch config files
  • Create IAM Policy to access bucket
  • Create IAM Role and assign bucket access policy to it
  • Create Base Image (AMI) from which we will be spawning master node and data node
  • Create On-demand Master Node
  • Create Data nodes on Spot Fleet 

#1 Create security group "elasticsearch-sg"
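
No commands are shown for this step in the post; a rough AWS CLI equivalent (a sketch — the VPC ID and CIDR are placeholders, and the assumption is that only Elasticsearch's HTTP port 9200 and transport port 9300 need to be reachable from within the VPC):

SG_ID=$(aws ec2 create-security-group --group-name elasticsearch-sg \
    --description "Elasticsearch nodes" --vpc-id vpc-xxxxxxxx \
    --query 'GroupId' --output text)

# Allow HTTP (9200) and transport (9300) traffic from inside the VPC only
aws ec2 authorize-security-group-ingress --group-id "$SG_ID" \
    --protocol tcp --port 9200 --cidr 10.0.0.0/16
aws ec2 authorize-security-group-ingress --group-id "$SG_ID" \
    --protocol tcp --port 9300 --cidr 10.0.0.0/16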

#2 Elasticsearch Configuration files

elasticsearch_master.yml for master node:

cluster.name: spotes
path.data: /usr/share/elasticsearch/data
network.host: 0.0.0.0
http.port: 9200
node.master: true
node.data: false
node.name: "nodename"
transport.tcp.compress: true
bootstrap.memory_lock: true
discovery.zen.minimum_master_nodes: 1
discovery.zen.ping.unicast.hosts: ['es1.xyz.vpc']
thread_pool.bulk.queue_size: 500

elasticsearch_data.yml file for data node:

cluster.name: spotes
path.data: /usr/share/elasticsearch/data
network.host: 0.0.0.0
http.port: 9200
node.master: false
node.data: true
node.name: "nodename"
bootstrap.memory_lock: true
transport.tcp.compress: true
discovery.zen.minimum_master_nodes: 1
discovery.zen.ping.unicast.hosts: ['es1.xyz.vpc']
thread_pool.bulk.queue_size: 500

master.jvm.options file for master node:

-Xms4g
-Xmx4g
-XX:+UseConcMarkSweepGC
-XX:CMSInitiatingOccupancyFraction=75
-XX:+UseCMSInitiatingOccupancyOnly
-XX:+DisableExplicitGC
-XX:+AlwaysPreTouch
-server
-Xss1m
-Djava.awt.headless=true
-Dfile.encoding=UTF-8
-Djna.nosys=true
-Djdk.io.permissionsUseCanonicalPath=true
-Dio.netty.noUnsafe=true
-Dio.netty.noKeySetOptimization=true
-Dio.netty.recycler.maxCapacityPerThread=0
-Dlog4j.shutdownHookEnabled=false
-Dlog4j2.disable.jmx=true
-Dlog4j.skipJansi=true
-XX:+HeapDumpOnOutOfMemoryError

data.jvm.options file for data node

-Xms8g
-Xmx8g
-XX:+UseConcMarkSweepGC
-XX:CMSInitiatingOccupancyFraction=75
-XX:+UseCMSInitiatingOccupancyOnly
-XX:+DisableExplicitGC
-XX:+AlwaysPreTouch
-server
-Xss1m
-Djava.awt.headless=true
-Dfile.encoding=UTF-8
-Djna.nosys=true
-Djdk.io.permissionsUseCanonicalPath=true
-Dio.netty.noUnsafe=true
-Dio.netty.noKeySetOptimization=true
-Dio.netty.recycler.maxCapacityPerThread=0
-Dlog4j.shutdownHookEnabled=false
-Dlog4j2.disable.jmx=true
-Dlog4j.skipJansi=true
-XX:+HeapDumpOnOutOfMemoryError

#3 Create S3 bucket to hold Elasticsearch config files

Create a bucket named es-configurations and upload all the configuration files we created above.
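
With the AWS CLI this amounts to roughly the following (a sketch, assuming the config files are in the current directory and the bucket name is still available):

aws s3 mb s3://es-configurations --region ap-south-1
aws s3 cp elasticsearch_master.yml s3://es-configurations/
aws s3 cp elasticsearch_data.yml s3://es-configurations/
aws s3 cp master.jvm.options s3://es-configurations/
aws s3 cp data.jvm.options s3://es-configurations/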



#4 Create IAM Policy to access bucket

Create the following IAM policy (es-configurations-bucket-access):

{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Action": [
                "s3:GetObject"
            ],
            "Effect": "Allow",
            "Resource": "arn:aws:s3:::es-configurations/*"
        }
    ]
}


#5 Create IAM Role and assign bucket access policy to it

Create an IAM role "elasticsearch-role" with EC2 as the trusted service (it will be attached to the instances as an instance profile) and attach the above policy to it.

#6 Create Base Image (AMI) from which we will be spawning master node and data node

First we will launch an instance in which we will install Docker and the AWS CLI, and pull the Elasticsearch Docker image. After installing these basics, we will create an AMI from it, which will then be used to launch the master and data nodes.

Now go ahead and launch an instance, providing the following userdata to it:

#!/bin/bash

# output log of userdata to /var/log/user-data.log
exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1

# Install awscli
apt-get update
apt install awscli -y

# Set max_map_count
echo 262144 | sudo tee /proc/sys/vm/max_map_count

# Install docker
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
apt-get update
apt-cache policy docker-ce
apt-get install -y docker-ce
service docker restart

# Get official elasticsearch docker image
docker pull docker.elastic.co/elasticsearch/elasticsearch:6.2.3

# Create /etc/elasticsearch directory to hold elasticsearch config files like elasticsearch.yml and jvm.options
mkdir -p /etc/elasticsearch


When you are done running the above script, create an AMI from the current instance.
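
This can be done from the console, or with the CLI along these lines (the instance ID and AMI name are placeholders):

aws ec2 create-image --instance-id i-0123456789abcdef0 \
    --name "es-base-ami" --description "Docker + Elasticsearch 6.2.3 image pulled" \
    --region ap-south-1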

#7 Create On-demand Master Node

Create an on-demand instance of a type with 8GB of memory (since we are giving 4GB to the heap), using the AMI created above, and provide the following userdata to it:

#!/bin/bash
 
set -x
# output log of userdata to /var/log/user-data.log
exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1
 
aws s3 cp s3://es-configurations/elasticsearch_master.yml /etc/elasticsearch/elasticsearch.yml --region ap-south-1
aws s3 cp s3://es-configurations/master.jvm.options /etc/elasticsearch/jvm.options --region ap-south-1
 
sed -i -e "s/nodename/${HOSTNAME}/g" /etc/elasticsearch/elasticsearch.yml
 
mkdir -p /vol/es
 
chown -R 1000:1000 /vol
chown -R 1000:1000 /etc/elasticsearch
 
sysctl -w vm.max_map_count=262144
 
#start docker container
docker run --net=host -d -p 9200:9200 -e "xpack.security.enabled=false" --restart unless-stopped -v /vol/es:/usr/share/elasticsearch/data -v /etc/elasticsearch/jvm.options:/usr/share/elasticsearch/config/jvm.options -v /etc/elasticsearch/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml  --ulimit nofile=65536:65536 --ulimit memlock=-1:-1 docker.elastic.co/elasticsearch/elasticsearch:6.2.3


After launching the master node, make a Route53 entry for es1.xyz.vpc pointing to its private IP (or use any domain you want for your master node), for example as sketched below.
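
A sketch of that record with the AWS CLI (the hosted zone ID and the private IP are placeholders for your own values):

cat > es1-record.json <<'EOF'
{
  "Changes": [{
    "Action": "UPSERT",
    "ResourceRecordSet": {
      "Name": "es1.xyz.vpc",
      "Type": "A",
      "TTL": 300,
      "ResourceRecords": [{"Value": "10.0.1.25"}]
    }
  }]
}
EOF

aws route53 change-resource-record-sets \
    --hosted-zone-id ZXXXXXXXXXXXXX \
    --change-batch file://es1-record.json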

#8 Create Data nodes on Spot Fleet 

Now we will create a Spot Fleet request to launch the data nodes as spot instances. Go to "Spot Requests" in the AWS EC2 dashboard and click the "Request Spot Instances" button:



  • Select "Request and Maintain", set "total target capacity to 3" as we will be launching 3 data nodes.
  • Select the AMI we created above. Choose any instance type having 16GB of RAM (as we are setting HEAP to 8GB).
  • Select required VPC, AZ.
  • Add additional disk of size 50GB (This could differ as per your requirement)
  • You can provide health check, monitoring and other options.
  • Provide a security group (elasticsearch-sg in our case)
  • Give a key-pair name which can be used to SSH
  • Select "elasticsearch-role" in "IAM Instance Profile"
  • Provide the following userdata:
#!/bin/bash
 
set -x
# output log of userdata to /var/log/user-data.log
exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1
 
aws s3 cp s3://es-configurations/elasticsearch_data.yml /etc/elasticsearch/elasticsearch.yml --region ap-south-1
aws s3 cp s3://es-configurations/data.jvm.options /etc/elasticsearch/jvm.options --region ap-south-1

 
sed -i -e "s/nodename/${HOSTNAME}/g" /etc/elasticsearch/elasticsearch.yml
 
mkfs.xfs /dev/xvdba
mkdir -p /vol/es
mount /dev/xvdba /vol/es
 
chown -R 1000:1000 /vol
chown -R 1000:1000 /etc/elasticsearch
 
sysctl -w vm.max_map_count=262144
 
#start docker container
docker run --net=host -d -p 9200:9200 -e "xpack.security.enabled=false" --restart unless-stopped -v /vol/es:/usr/share/elasticsearch/data -v /etc/elasticsearch/jvm.options:/usr/share/elasticsearch/config/jvm.options -v /etc/elasticsearch/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml  --ulimit nofile=65536:65536 --ulimit memlock=-1:-1 docker.elastic.co/elasticsearch/elasticsearch:6.2.3
 

You can leave the other settings at their defaults. Click on "Launch"; this will create a spot request and launch three nodes, which will eventually join the cluster.
After the nodes are ready, go to the master node and make a curl request to check whether the nodes have joined the cluster:


curl localhost:9200/_cat/nodes?v

This will show the list of all nodes.

 
