The following script changes the start and end dates of several queries saved as files in the folder /app/queries. It is used by executing the script with two date strings as arguments, for example:
./config_query.sh 2022-03-28 2022-03-30
This would replace the start dates of all queries in the folder /app/queries with the first argument and the end dates with the second argument.
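For example, the invocation above turns the range line of the leads query (shown in full further below) from
|> range(start: 2022-03-11T00:00:00.000Z, stop: 2022-03-15T00:00:00.000Z)
into
|> range(start: 2022-03-28T00:00:00.000Z, stop: 2022-03-30T00:00:00.000Z)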
I would like some feedback on the readability of the script. The sed commands in particular are very hard to read, which makes the script hard to maintain and extend. Is there a better way to change the start and end dates of the queries? (I sketch one possible direction right after the script.)
This is the script:
#!/usr/bin/env bash
#: Title : config_query.sh
#: Date : 2022-04-06
#: Version : 1.0
#: Description : Change start and end dates of queries to values of parameters $1 and $2.
#: Options : None
# Note: The script checks that both date strings match the format YYYY-mm-dd,
# but it does not validate the date values themselves, i.e. valid dates have
# to be passed to generate consistent queries.
ROOT="/app"
QUERIES="${ROOT}/queries"
arr=( "${QUERIES}/leads.flux" "${QUERIES}/subs.flux" "${QUERIES}/gads.sql" "${QUERIES}/ms.sql" "${QUERIES}/li.sql" )
# Check number of arguments passed to script:
if [ "$#" -ne 2 ]; then
echo "Illegal number of arguments. Program expects two date strings (YYYY-mm-dd)."
exit 1
fi
START_DATE=$1
END_DATE=$2
# Check if start and end are formatted correctly:
for date in "$START_DATE" "$END_DATE"
do
format="^[0-9]{4}-[0-9]{2}-[0-9]{2}$"
if ! [[ $date =~ $format ]]
then
echo "Date $date is incorrectly formatted (not YYYY-mm-dd)."
exit 1
fi
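# (The regex above only validates the format; a date *value* check could be
# added here as well, e.g. with GNU date:
#   date -d "$date" > /dev/null 2>&1 || { echo "Invalid date: $date"; exit 1; })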
done
START_DATE_DECR=$(date '+%Y-%m-%d' -d "$START_DATE - 1 day")
START_DATE_INCR=$(date '+%Y-%m-%d' -d "$START_DATE + 1 day")
END_DATE_INCR=$(date '+%Y-%m-%d' -d "$END_DATE + 1 day")
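# START_DATE_DECR/END_DATE_INCR widen the subs query's range by one day on each
# side; START_DATE_INCR is used by its trailing range() call, which drops the
# first row again.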
# Replaces 'start' and 'stop' in Influx leads query:
sed -i -E "s/start: .{4}-.{2}-.{2}T00:00:00.000Z, stop: .{4}-.{2}-.{2}/start: ${START_DATE}T00:00:00.000Z, stop: ${END_DATE}/g" "${arr[0]}"
# Replaces 'start' and 'stop' in Influx subs query:
sed -i -E "s/start: .{4}-.{2}-.{2}T00:00:00.000Z, stop: .{4}-.{2}-.{2}/start: ${START_DATE_DECR}T00:00:00.000Z, stop: ${END_DATE_INCR}/g" "${arr[1]}"
sed -i -E "s/start: .{4}-.{2}-.{2}T00:00:00.000Z, stop: .{4}-.{2}-.{2}.*delete.first.row/start: ${START_DATE_INCR}T00:00:00.000Z, stop: ${END_DATE_INCR}T00:00:00.000Z) \/\/ delete first row/g" "${arr[1]}"
# Replaces 'START_DATE' and 'END_DATE' in BigQuery queries:
for ((i=2; i<${#arr[@]}; i++))
do
sed -i -E "s/\('.{4}-.{2}-.{2}', DATE_SUB\('.{4}-.{2}-.{2}',/\('$START_DATE', DATE_SUB\('$END_DATE',/g" "${arr[$i]}";
sed -i -E "s/>= '.{4}-.{2}-.{2}'/>= '$START_DATE'/g" "${arr[$i]}";
sed -i -E "s/< '.{4}-.{2}-.{2}'/< '$END_DATE'/g" "${arr[$i]}";
done
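This is the kind of refactoring I have in mind: factoring the repeated patterns into variables so each substitution reads more clearly (a sketch only, not part of the current script; it assumes the same GNU sed as above and shows only the leads query):
D='[0-9]{4}-[0-9]{2}-[0-9]{2}'   # date pattern (YYYY-mm-dd)
TS="${D}T00:00:00\.000Z"         # Flux timestamp pattern
# Same substitution as for the leads query above, with the patterns named:
sed -i -E "s/start: ${TS}, stop: ${D}/start: ${START_DATE}T00:00:00.000Z, stop: ${END_DATE}/g" "${arr[0]}"
Would something along these lines be an improvement, or is there a more idiomatic approach?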
These are the queries for reference (I'm not seeking feedback on the queries; I just included them for completeness):
/app/queries/gads.sql:
WITH dates AS (
SELECT
*
FROM
UNNEST(GENERATE_DATE_ARRAY('2022-03-11', DATE_SUB('2022-03-15', INTERVAL 1 DAY), INTERVAL 1 DAY)) AS date
), gads_data AS (
SELECT DATE(__hevo_report_date) AS date,
campaign,
ad_group,
sum(impressions) AS impressions,
sum(clicks) AS clicks,
ROUND(sum(cost)/1000000, 2) AS spend
FROM `table1`
WHERE DATE(__hevo_report_date) >= '2022-03-11' AND DATE(__hevo_report_date) < '2022-03-15'
GROUP BY date, campaign, ad_group
)
SELECT dates.date, campaign, ad_group, impressions, clicks, spend
FROM dates
LEFT OUTER JOIN gads_data
ON dates.date = gads_data.date
ORDER BY dates.date
/app/queries/li.sql:
WITH dates AS (
SELECT date
FROM
UNNEST(GENERATE_DATE_ARRAY('2022-03-11', DATE_SUB('2022-03-15', INTERVAL 1 DAY), INTERVAL 1 DAY)) AS date
), gads_data AS (
SELECT DATE(TIMESTAMP_MILLIS(date)) AS date,
campaign_id,
sum(impressions) AS impressions,
sum(clicks) AS clicks,
sum(cost_in_local_currency) AS spend
FROM `table2`
WHERE DATE(TIMESTAMP_MILLIS(date)) >= '2022-03-11' AND DATE(TIMESTAMP_MILLIS(date)) < '2022-03-15'
GROUP BY date, campaign_id
), names AS (
SELECT id, name AS campaign
FROM `ten-x-studio-data.threema.li_campaign`
)
SELECT dates.date, campaign, impressions, clicks, spend
FROM dates
LEFT OUTER JOIN gads_data
ON dates.date = gads_data.date
LEFT OUTER JOIN names
ON gads_data.campaign_id = names.id
ORDER BY dates.date
/app/queries/ms.sql:
WITH dates AS (
SELECT
*
FROM
UNNEST(GENERATE_DATE_ARRAY('2022-03-11', DATE_SUB('2022-03-15', INTERVAL 1 DAY), INTERVAL 1 DAY)) AS date
), ms_data AS (
SELECT DATE(date) AS date,
campaign_name AS campaign,
ad_group_name AS ad_group,
sum(impressions) AS impressions,
sum(clicks) AS clicks,
sum(spend) AS spend
FROM `table3`
WHERE DATE(date) >= '2022-03-11' AND DATE(date) < '2022-03-15'
GROUP BY date, campaign_name, ad_group_name
)
SELECT dates.date, campaign, ad_group, impressions, clicks, spend
FROM dates
LEFT OUTER JOIN ms_data
ON dates.date = ms_data.date
ORDER BY dates.date
/app/queries/leads.flux:
from(bucket: "bucket1")
|> range(start: 2022-03-11T00:00:00.000Z, stop: 2022-03-15T00:00:00.000Z)
|> filter(fn: (r) => r["env"] == "productive")
|> duplicate(as: "val", column: "_value")
|> difference(columns: ["val"], nonNegative: true, keepFirst: true)
|> fill(column: "val", value: 0.0)
|> group(columns: ["_field", "path"])
|> aggregateWindow(every: 1d, fn: sum, column: "val")
|> yield(name: "mean")
/app/queries/subs.flux:
// Get added values
a = from(bucket: "bucket2")
|> range(start: 2022-03-10T00:00:00.000Z, stop: 2022-03-16T00:00:00.000Z)
|> filter(fn: (r) => r["env"] == "productive")
|> duplicate(column: "_value", as: "added_values")
|> difference(columns: ["added_values"], nonNegative: true, keepFirst: true)
|> fill(column: "added_values", value: 0.0)
|> aggregateWindow(every: 1d, fn: sum, column: "added_values")
|> map(fn:(r) => ({r with _time: string(v: r._time)}))
|> drop(columns:["_measurement", "_start", "_stop", "env"])
// Get total values
b = from(bucket: "bucket2")
|> range(start: 2022-03-10T00:00:00.000Z, stop: 2022-03-16T00:00:00.000Z)
|> filter(fn: (r) => r["env"] == "productive")
|> aggregateWindow(every: 1d, fn: last, column: "_value")
|> map(fn:(r) => ({r with _time: string(v: r._time)}))
|> drop(columns:["_measurement", "_start", "_stop", "env"])
// Get subtracted values
c = from(bucket: "bucket2")
|> range(start: 2022-03-10T00:00:00.000Z, stop: 2022-03-16T00:00:00.000Z)
|> filter(fn: (r) => r["env"] == "productive")
|> duplicate(column: "_value", as: "sub_values")
|> difference(columns: ["sub_values"], nonNegative: false, keepFirst: true)
|> fill(column: "sub_values", value: 0.0)
|> map(fn: (r) => ({r with sub_values: if r.sub_values < 0.0 then r.sub_values else 0.0}))
|> aggregateWindow(every: 1d, fn: sum, column: "sub_values")
|> map(fn:(r) => ({r with _time: string(v: r._time)}))
|> drop(columns:["_measurement", "_start", "_stop", "env"])
temp = join(tables: {a:a, b:b},
on: ["_field", "_time", "isTrial", "offer", "segment"]
)
res = join(tables: {temp:temp, c:c},
on: ["_field", "_time", "isTrial", "offer", "segment"]
)
|> map(fn: (r) => ({ r with _time: time(v: r._time) }))
|> range(start: 2022-03-12T00:00:00.000Z, stop: 2022-03-16T00:00:00.000Z) // delete first row
|> group()
|> sort(columns: ["_time", "_field", "isTrial", "offer", "segment"], desc: false)
|> yield()