|
1 | 1 | #!/bin/bash |
| 2 | +shopt -s expand_aliases |
2 | 3 | # The following script will help gather row size estimates for each table in |
3 | 4 | # a cluster. |
4 | 5 | # It will query each table a fixed number of rows at a low query rate not to |
|
13 | 14 | # The script takes the same parameters as cqlsh to connect to Cassandra |
14 | 15 | # example: ./row-size-sampler.sh cassandra.us-east-1.amazonaws.com 9142 -u "sampleuser" -p "samplepass" --ssl |
15 | 16 |
|
| 17 | +# check whether cqlsh-expansion is installed, then whether cqlsh is installed, then fall back to a local ./cqlsh file |
| 18 | +if [ -x "$(command -v cqlsh-expansion)" ]; then |
| 19 | + echo 'using installed cqlsh-expansion' |
| 20 | + alias kqlsh='cqlsh-expansion' |
| 21 | +elif [ -x "$(command -v cqlsh)" ]; then |
| 22 | + echo 'using installed cqlsh' |
| 23 | + alias kqlsh='cqlsh' |
| 24 | +elif [ -e cqlsh ]; then |
| 25 | + echo 'using local cqlsh' |
| 26 | + alias kqlsh='./cqlsh' |
| 27 | +else |
| 28 | + echo 'cqlsh not found' |
| 29 | + exit 1 |
| 30 | +fi |
| 31 | + |
| 32 | +echo 'starting...' |
| 33 | + |
16 | 34 | SYSTEMKEYSPACEFILTER='system\|system_schema\|system_traces\|system_auth\|dse_auth\|dse_security\|dse_leases\|system_distributed\|dse_perf\|dse_system\|OpsCenter\|cfs\|cfs_archive\|dse_leases\|dsefs\|HiveMetaStore\|spark_system' |
17 | 35 |
|
18 | 36 | TABLEFILTER='^-\|^table_name\|(\|)' |
19 | 37 |
|
20 | | -keyspaces=$(echo desc keyspaces | ./cqlsh $@ | xargs -n1 echo | grep -v $SYSTEMKEYSPACEFILTER) |
| 38 | +keyspaces=$(echo desc keyspaces | kqlsh $@ | xargs -n1 echo | grep -v $SYSTEMKEYSPACEFILTER) |
21 | 39 | for ks in $keyspaces; do |
22 | | - tables=$(echo "SELECT table_name FROM system_schema.tables WHERE keyspace_name='$ks';" | ./cqlsh $@ | xargs -n1 echo | grep -v $TABLEFILTER) |
| 40 | + tables=$(echo "SELECT table_name FROM system_schema.tables WHERE keyspace_name='$ks';" | kqlsh $@ | xargs -n1 echo | grep -v $TABLEFILTER) |
23 | 41 | for tb in $tables; do |
24 | | - ./cqlsh $@ -e "CONSISTENCY LOCAL_ONE; PAGING 100; SELECT * FROM \"$ks\".\"$tb\" LIMIT 30000;" | grep -v '\[json\]\|rows)\|-----\|^$' | tr -d ' ' | awk -v keyspace=$ks -v table=$tb -F'|' 'BEGIN {columns=0; numSamples=30000; kilobyte=1024; min = "NaN"; max = -1; lines = 1; } { if(NR==2){columns=NF;} if(NR>2){thislen=length($0)+107; total+=thislen; squares+=thislen^2; lines+=1; avg=total/lines; min = (thislen<min ? thislen : min); max = (thislen>max ? thislen : max) }} NR==numSamples {exit} END { printf("%s.%s = { lines: %d, columns: %d, average: %d bytes, stdev: %d bytes, min: %d bytes, max: %d bytes}\n", keyspace, table, lines, columns, avg, sqrt(squares/lines - (avg^2)), min, max); }' >> row-size-estimates.txt 2>&1 |
25 | | - ./cqlsh $@ -e "DESCRIBE \"$ks\".\"$tb\";" | grep -i blob | while read line; do printf "\t...this table contains a BLOB type, if the majority of row size is from the BLOB, divide the result of the row size in half" ; done |
| 42 | + kqlsh $@ -e "CONSISTENCY LOCAL_ONE; PAGING 100; SELECT * FROM \"$ks\".\"$tb\" LIMIT 30000;" | grep -v '\[json\]\|rows)\|-----\|^$' | tr -d ' ' | awk -v keyspace=$ks -v table=$tb -F'|' 'BEGIN {columns=0; numSamples=30000; kilobyte=1024; min = "NaN"; max = -1; lines = 1; } { if(NR==2){columns=NF;} if(NR>2){thislen=length($0)+107; total+=thislen; squares+=thislen^2; lines+=1; avg=total/lines; min = (thislen<min ? thislen : min); max = (thislen>max ? thislen : max) }} NR==numSamples {exit} END { printf("%s.%s = { lines: %d, columns: %d, average: %d bytes, stdev: %d bytes, min: %d bytes, max: %d bytes}\n", keyspace, table, lines, columns, avg, sqrt(squares/lines - (avg^2)), min, max); }' |
| 43 | + kqlsh $@ -e "DESCRIBE \"$ks\".\"$tb\";" | grep -i blob | while read line; do printf "\t...\"$ks\".\"$tb\" contains a BLOB type, if the majority of row size is from the BLOB, then divide the estimate in half" ; done |
26 | 44 | done |
27 | 45 | done |
0 commit comments