lokinet/libutp/utp_internal.cpp

/*
 * Copyright (c) 2010-2013 BitTorrent, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <limits.h>  // for UINT_MAX
#include <time.h>

#include "utp_types.h"
#include "utp_packedsockaddr.h"
#include "utp_internal.h"
#include "utp_hash.h"

#define TIMEOUT_CHECK_INTERVAL 500

// number of bytes to increase max window size by, per RTT. This is
// scaled down linearly proportional to off_target. i.e. if all packets
// in one window have 0 delay, window size will increase by this number.
// Typically it's less. TCP increases one MSS per RTT, which is 1500
#define MAX_CWND_INCREASE_BYTES_PER_RTT 3000
#define CUR_DELAY_SIZE 3
// experiments suggest that a clock skew of 10 ms per 325 seconds
// is not impossible. Reset delay_base every 13 minutes. The clock
// skew is dealt with by observing the delay base in the other
// direction, and adjusting our own upwards if the opposite direction
// delay base keeps going down
#define DELAY_BASE_HISTORY 13
#define MAX_WINDOW_DECAY 100  // ms

#define REORDER_BUFFER_SIZE 32
#define REORDER_BUFFER_MAX_SIZE 1024
#define OUTGOING_BUFFER_MAX_SIZE 1024

#define PACKET_SIZE 1435

// this is the minimum max_window value. It can never drop below this
#define MIN_WINDOW_SIZE 10

// if we receive 4 or more duplicate acks, we resend the packet
// that hasn't been acked yet
#define DUPLICATE_ACKS_BEFORE_RESEND 3

// Allow a reception window of at least 3 ack_nrs behind seq_nr
// A non-SYN packet with an ack_nr difference greater than this is
// considered suspicious and ignored
#define ACK_NR_ALLOWED_WINDOW DUPLICATE_ACKS_BEFORE_RESEND

#define RST_INFO_TIMEOUT 10000
#define RST_INFO_LIMIT 1000
// 29 seconds determined from measuring many home NAT devices
#define KEEPALIVE_INTERVAL 29000

#define SEQ_NR_MASK 0xFFFF
#define ACK_NR_MASK 0xFFFF
#define TIMESTAMP_MASK 0xFFFFFFFF

#define DIV_ROUND_UP(num, denom) ((num + denom - 1) / denom)

// The totals are derived from the following data:
//  45: IPv6 address including embedded IPv4 address
//  11: Scope Id
//   2: Brackets around IPv6 address when port is present
//   6: Port (including colon)
//   1: Terminating null byte
char addrbuf[65];
#define addrfmt(x, s) x.fmt(s, sizeof(s))

#if(defined(__SVR4) && defined(__sun))
#pragma pack(1)
#else
#pragma pack(push, 1)
#endif

// these packet sizes are including the uTP header wich
// is either 20 or 23 bytes depending on version
#define PACKET_SIZE_EMPTY_BUCKET 0
#define PACKET_SIZE_EMPTY 23
#define PACKET_SIZE_SMALL_BUCKET 1
#define PACKET_SIZE_SMALL 373
#define PACKET_SIZE_MID_BUCKET 2
#define PACKET_SIZE_MID 723
#define PACKET_SIZE_BIG_BUCKET 3
#define PACKET_SIZE_BIG 1400
#define PACKET_SIZE_HUGE_BUCKET 4

struct PACKED_ATTRIBUTE PacketFormatV1
{
  // packet_type (4 high bits)
  // protocol version (4 low bits)
  byte ver_type;
  byte
  version() const
  {
    return ver_type & 0xf;
  }
  byte
  type() const
  {
    return ver_type >> 4;
  }
  void
  set_version(byte v)
  {
    ver_type = (ver_type & 0xf0) | (v & 0xf);
  }
  void
  set_type(byte t)
  {
    ver_type = (ver_type & 0xf) | (t << 4);
  }

  // Type of the first extension header
  byte ext;
  // connection ID
  uint16_big connid;
  uint32_big tv_usec;
  uint32_big reply_micro;
  // receive window size in bytes
  uint32_big windowsize;
  // Sequence number
  uint16_big seq_nr;
  // Acknowledgment number
  uint16_big ack_nr;
};

struct PACKED_ATTRIBUTE PacketFormatAckV1
{
  PacketFormatV1 pf;
  byte ext_next;
  byte ext_len;
  byte acks[4];
};

#if(defined(__SVR4) && defined(__sun))
#pragma pack(0)
#else
#pragma pack(pop)
#endif

enum
{
  ST_DATA  = 0,   // Data packet.
  ST_FIN   = 1,   // Finalize the connection. This is the last packet.
  ST_STATE = 2,   // State packet. Used to transmit an ACK with no data.
  ST_RESET = 3,   // Terminate connection forcefully.
  ST_SYN   = 4,   // Connect SYN
  ST_NUM_STATES,  // used for bounds checking
};

enum CONN_STATE
{
  CS_UNINITIALIZED = 0,
  CS_IDLE,
  CS_SYN_SENT,
  CS_SYN_RECV,
  CS_CONNECTED,
  CS_CONNECTED_FULL,
  CS_RESET,
  CS_DESTROY
};

#if UTP_DEBUG_LOGGING
static const cstr flagnames[] = {"ST_DATA", "ST_FIN", "ST_STATE", "ST_RESET",
                                 "ST_SYN"};

static const cstr statenames[] = {
    "UNINITIALIZED",  "IDLE",          "SYN_SENT", "SYN_RECV", "CONNECTED",
    "CONNECTED_FULL", "DESTROY_DELAY", "RESET",    "DESTROY"};

#endif

struct OutgoingPacket
{
  size_t length;
  size_t payload;
  uint64 time_sent;  // microseconds
  uint transmissions : 31;
  bool need_resend : 1;
  byte data[1];
};

struct SizableCircularBuffer
{
  // This is the mask. Since it's always a power of 2, adding 1 to this value
  // will return the size.
  size_t mask;
  // This is the elements that the circular buffer points to
  void **elements;

  void *
  get(size_t i) const
  {
    assert(elements);
    return elements ? elements[i & mask] : NULL;
  }
  void
  put(size_t i, void *data)
  {
    assert(elements);
    elements[i & mask] = data;
  }

  void
  grow(size_t item, size_t index);
  void
  ensure_size(size_t item, size_t index)
  {
    if(index > mask)
      grow(item, index);
  }
  size_t
  size()
  {
    return mask + 1;
  }
};

// Item contains the element we want to make space for
// index is the index in the list.
void
SizableCircularBuffer::grow(size_t item, size_t index)
{
  // Figure out the new size.
  size_t size = mask + 1;
  do
    size *= 2;
  while(index >= size);

  // Allocate the new buffer
  void **buf = (void **)calloc(size, sizeof(void *));

  size--;

  // Copy elements from the old buffer to the new buffer
  for(size_t i = 0; i <= mask; i++)
  {
    buf[(item - index + i) & size] = get(item - index + i);
  }

  // Swap to the newly allocated buffer
  mask = size;
  free(elements);
  elements = buf;
}

// compare if lhs is less than rhs, taking wrapping
// into account. if lhs is close to UINT_MAX and rhs
// is close to 0, lhs is assumed to have wrapped and
// considered smaller
bool
wrapping_compare_less(uint32 lhs, uint32 rhs, uint32 mask)
{
  // distance walking from lhs to rhs, downwards
  const uint32 dist_down = (lhs - rhs) & mask;
  // distance walking from lhs to rhs, upwards
  const uint32 dist_up = (rhs - lhs) & mask;

  // if the distance walking up is shorter, lhs
  // is less than rhs. If the distance walking down
  // is shorter, then rhs is less than lhs
  return dist_up < dist_down;
}

struct DelayHist
{
  uint32 delay_base;

  // this is the history of delay samples,
  // normalized by using the delay_base. These
  // values are always greater than 0 and measures
  // the queuing delay in microseconds
  uint32 cur_delay_hist[CUR_DELAY_SIZE];
  size_t cur_delay_idx;

  // this is the history of delay_base. It's
  // a number that doesn't have an absolute meaning
  // only relative. It doesn't make sense to initialize
  // it to anything other than values relative to
  // what's been seen in the real world.
  uint32 delay_base_hist[DELAY_BASE_HISTORY];
  size_t delay_base_idx;
  // the time when we last stepped the delay_base_idx
  uint64 delay_base_time;

  bool delay_base_initialized;

  void
  clear(uint64 current_ms)
  {
    delay_base_initialized = false;
    delay_base             = 0;
    cur_delay_idx          = 0;
    delay_base_idx         = 0;
    delay_base_time        = current_ms;
    for(size_t i = 0; i < CUR_DELAY_SIZE; i++)
    {
      cur_delay_hist[i] = 0;
    }
    for(size_t i = 0; i < DELAY_BASE_HISTORY; i++)
    {
      delay_base_hist[i] = 0;
    }
  }

  void
  shift(const uint32 offset)
  {
    // the offset should never be "negative"
    // assert(offset < 0x10000000);

    // increase all of our base delays by this amount
    // this is used to take clock skew into account
    // by observing the other side's changes in its base_delay
    for(size_t i = 0; i < DELAY_BASE_HISTORY; i++)
    {
      delay_base_hist[i] += offset;
    }
    delay_base += offset;
  }

  void
  add_sample(const uint32 sample, uint64 current_ms)
  {
    // The two clocks (in the two peers) are assumed not to
    // progress at the exact same rate. They are assumed to be
    // drifting, which causes the delay samples to contain
    // a systematic error, either they are under-
    // estimated or over-estimated. This is why we update the
    // delay_base every two minutes, to adjust for this.

    // This means the values will keep drifting and eventually wrap.
    // We can cross the wrapping boundry in two directions, either
    // going up, crossing the highest value, or going down, crossing 0.

    // if the delay_base is close to the max value and sample actually
    // wrapped on the other end we would see something like this:
    // delay_base = 0xffffff00, sample = 0x00000400
    // sample - delay_base = 0x500 which is the correct difference

    // if the delay_base is instead close to 0, and we got an even lower
    // sample (that will eventually update the delay_base), we may see
    // something like this:
    // delay_base = 0x00000400, sample = 0xffffff00
    // sample - delay_base = 0xfffffb00
    // this needs to be interpreted as a negative number and the actual
    // recorded delay should be 0.

    // It is important that all arithmetic that assume wrapping
    // is done with unsigned intergers. Signed integers are not guaranteed
    // to wrap the way unsigned integers do. At least GCC takes advantage
    // of this relaxed rule and won't necessarily wrap signed ints.

    // remove the clock offset and propagation delay.
    // delay base is min of the sample and the current
    // delay base. This min-operation is subject to wrapping
    // and care needs to be taken to correctly choose the
    // true minimum.

    // specifically the problem case is when delay_base is very small
    // and sample is very large (because it wrapped past zero), sample
    // needs to be considered the smaller

    if(!delay_base_initialized)
    {
      // delay_base being 0 suggests that we haven't initialized
      // it or its history with any real measurements yet. Initialize
      // everything with this sample.
      for(size_t i = 0; i < DELAY_BASE_HISTORY; i++)
      {
        // if we don't have a value, set it to the current sample
        delay_base_hist[i] = sample;
        continue;
      }
      delay_base             = sample;
      delay_base_initialized = true;
    }

    if(wrapping_compare_less(sample, delay_base_hist[delay_base_idx],
                             TIMESTAMP_MASK))
    {
      // sample is smaller than the current delay_base_hist entry
      // update it
      delay_base_hist[delay_base_idx] = sample;
    }

    // is sample lower than delay_base? If so, update delay_base
    if(wrapping_compare_less(sample, delay_base, TIMESTAMP_MASK))
    {
      // sample is smaller than the current delay_base
      // update it
      delay_base = sample;
    }

    // this operation may wrap, and is supposed to
    const uint32 delay = sample - delay_base;
    // sanity check. If this is triggered, something fishy is going on
    // it means the measured sample was greater than 32 seconds!
    // assert(delay < 0x2000000);

    cur_delay_hist[cur_delay_idx] = delay;
    cur_delay_idx                 = (cur_delay_idx + 1) % CUR_DELAY_SIZE;

    // once every minute
    if(current_ms - delay_base_time > 60 * 1000)
    {
      delay_base_time = current_ms;
      delay_base_idx  = (delay_base_idx + 1) % DELAY_BASE_HISTORY;
      // clear up the new delay base history spot by initializing
      // it to the current sample, then update it
      delay_base_hist[delay_base_idx] = sample;
      delay_base                      = delay_base_hist[0];
      // Assign the lowest delay in the last 2 minutes to delay_base
      for(size_t i = 0; i < DELAY_BASE_HISTORY; i++)
      {
        if(wrapping_compare_less(delay_base_hist[i], delay_base,
                                 TIMESTAMP_MASK))
          delay_base = delay_base_hist[i];
      }
    }
  }

  uint32
  get_value()
  {
    uint32 value = UINT_MAX;
    for(size_t i = 0; i < CUR_DELAY_SIZE; i++)
    {
      value = min< uint32 >(cur_delay_hist[i], value);
    }
    // value could be UINT_MAX if we have no samples yet...
    return value;
  }
};

struct UTPSocket
{
  ~UTPSocket();

  PackedSockAddr addr;
  utp_context *ctx;

  int ida;  // for ack socket list

  uint16 retransmit_count;

  uint16 reorder_count;
  byte duplicate_ack;

  // the number of packets in the send queue. Packets that haven't
  // yet been sent count as well as packets marked as needing resend
  // the oldest un-acked packet in the send queue is seq_nr - cur_window_packets
  uint16 cur_window_packets;

  // how much of the window is used, number of bytes in-flight
  // packets that have not yet been sent do not count, packets
  // that are marked as needing to be re-sent (due to a timeout)
  // don't count either
  size_t cur_window;
  // maximum window size, in bytes
  size_t max_window;
  // UTP_SNDBUF setting, in bytes
  size_t opt_sndbuf;
  // UTP_RCVBUF setting, in bytes
  size_t opt_rcvbuf;

  // this is the target delay, in microseconds
  // for this socket. defaults to 100000.
  size_t target_delay;

  // Is a FIN packet in the reassembly buffer?
  bool got_fin : 1;
  // Have we reached the FIN?
  bool got_fin_reached : 1;

  // Have we sent our FIN?
  bool fin_sent : 1;
  // Has our fin been ACKed?
  bool fin_sent_acked : 1;

  // Reading is disabled
  bool read_shutdown : 1;
  // User called utp_close()
  bool close_requested : 1;

  // Timeout procedure
  bool fast_timeout : 1;

  // max receive window for other end, in bytes
  size_t max_window_user;
  CONN_STATE state;
  // TickCount when we last decayed window (wraps)
  int64 last_rwin_decay;

  // the sequence number of the FIN packet. This field is only set
  // when we have received a FIN, and the flag field has the FIN flag set.
  // it is used to know when it is safe to destroy the socket, we must have
  // received all packets up to this sequence number first.
  uint16 eof_pkt;

  // All sequence numbers up to including this have been properly received
  // by us
  uint16 ack_nr;
  // This is the sequence number for the next packet to be sent.
  uint16 seq_nr;

  uint16 timeout_seq_nr;

  // This is the sequence number of the next packet we're allowed to
  // do a fast resend with. This makes sure we only do a fast-resend
  // once per packet. We can resend the packet with this sequence number
  // or any later packet (with a higher sequence number).
  uint16 fast_resend_seq_nr;

  uint32 reply_micro;

  uint64 last_got_packet;
  uint64 last_sent_packet;
  uint64 last_measured_delay;

  // timestamp of the last time the cwnd was full
  // this is used to prevent the congestion window
  // from growing when we're not sending at capacity
  mutable uint64 last_maxed_out_window;

  void *userdata;

  // Round trip time
  uint rtt;
  // Round trip time variance
  uint rtt_var;
  // Round trip timeout
  uint rto;
  DelayHist rtt_hist;
  uint retransmit_timeout;
  // The RTO timer will timeout here.
  uint64 rto_timeout;
  // When the window size is set to zero, start this timer. It will send a new
  // packet every 30secs.
  uint64 zerowindow_time;

  uint32 conn_seed;
  // Connection ID for packets I receive
  uint32 conn_id_recv;
  // Connection ID for packets I send
  uint32 conn_id_send;
  // Last rcv window we advertised, in bytes
  size_t last_rcv_win;

  DelayHist our_hist;
  DelayHist their_hist;

  // extension bytes from SYN packet
  byte extensions[8];

  // MTU Discovery
  // time when we should restart the MTU discovery
  uint64 mtu_discover_time;
  // ceiling and floor of binary search. last is the mtu size
  // we're currently using
  uint32 mtu_ceiling, mtu_floor, mtu_last;
  // we only ever have a single probe in flight at any given time.
  // this is the sequence number of that probe, and the size of
  // that packet
  uint32 mtu_probe_seq, mtu_probe_size;

  // this is the average delay samples, as compared to the initial
  // sample. It's averaged over 5 seconds
  int32 average_delay;
  // this is the sum of all the delay samples
  // we've made recently. The important distinction
  // of these samples is that they are all made compared
  // to the initial sample, this is to deal with
  // wrapping in a simple way.
  int64 current_delay_sum;
  // number of sample ins current_delay_sum
  int current_delay_samples;
  // initialized to 0, set to the first raw delay sample
  // each sample that's added to current_delay_sum
  // is subtracted from the value first, to make it
  // a delay relative to this sample
  uint32 average_delay_base;
  // the next time we should add an average delay
  // sample into average_delay_hist
  uint64 average_sample_time;
  // the estimated clock drift between our computer
  // and the endpoint computer. The unit is microseconds
  // per 5 seconds
  int32 clock_drift;
  // just used for logging
  int32 clock_drift_raw;

  SizableCircularBuffer inbuf, outbuf;

#ifdef _DEBUG
  // Public per-socket statistics, returned by utp_get_stats()
  utp_socket_stats _stats;
#endif

  // true if we're in slow-start (exponential growth) phase
  bool slow_start;

  // the slow-start threshold, in bytes
  size_t ssthresh;

  void
  log(int level, char const *fmt, ...)
  {
    va_list va;
    char buf[4096], buf2[4096];

    // don't bother with vsnprintf() etc calls if we're not going to log.
    if(!ctx->would_log(level))
    {
      return;
    }

    va_start(va, fmt);
    vsnprintf(buf, 4096, fmt, va);
    va_end(va);
    buf[4095] = '\0';

    snprintf(buf2, 4096, "%p %s %06u %s", this, addrfmt(addr, addrbuf),
             conn_id_recv, buf);
    buf2[4095] = '\0';

    ctx->log_unchecked(this, buf2);
  }

  void
  schedule_ack();

  // called every time mtu_floor or mtu_ceiling are adjusted
  void
  mtu_search_update();
  void
  mtu_reset();

  // Calculates the current receive window
  size_t
  get_rcv_window()
  {
    // Trim window down according to what's already in buffer.
    const size_t numbuf = utp_call_get_read_buffer_size(this->ctx, this);
    assert((int)numbuf >= 0);
    return opt_rcvbuf > numbuf ? opt_rcvbuf - numbuf : 0;
  }

  // Test if we're ready to decay max_window
  // XXX this breaks when spaced by > INT_MAX/2, which is 49
  // days; the failure mode in that case is we do an extra decay
  // or fail to do one when we really shouldn't.
  bool
  can_decay_win(int64 msec) const
  {
    return (msec - last_rwin_decay) >= MAX_WINDOW_DECAY;
  }

  // If we can, decay max window, returns true if we actually did so
  void
  maybe_decay_win(uint64 current_ms)
  {
    if(can_decay_win(current_ms))
    {
      // TCP uses 0.5
      max_window      = (size_t)(max_window * .5);
      last_rwin_decay = current_ms;
      if(max_window < MIN_WINDOW_SIZE)
        max_window = MIN_WINDOW_SIZE;
      slow_start = false;
      ssthresh   = max_window;
    }
  }

  size_t
  get_header_size() const
  {
    return sizeof(PacketFormatV1);
  }

  size_t
  get_udp_mtu()
  {
    socklen_t len;
    SOCKADDR_STORAGE sa = addr.get_sockaddr_storage(&len);
    return utp_call_get_udp_mtu(this->ctx, this, (const struct sockaddr *)&sa,
                                len);
  }

  size_t
  get_udp_overhead()
  {
    socklen_t len;
    SOCKADDR_STORAGE sa = addr.get_sockaddr_storage(&len);
    return utp_call_get_udp_overhead(this->ctx, this,
                                     (const struct sockaddr *)&sa, len);
  }

  size_t
  get_overhead()
  {
    return get_udp_overhead() + get_header_size();
  }

  void
  send_data(byte *b, size_t length, bandwidth_type_t type, uint32 flags = 0);

  void
  send_ack(bool synack = false);

  void
  send_keep_alive();

  static void
  send_rst(utp_context *ctx, const PackedSockAddr &addr, uint32 conn_id_send,
           uint16 ack_nr, uint16 seq_nr);

  void
  send_packet(OutgoingPacket *pkt);

  bool
  is_full(int bytes = -1);
  bool
  flush_packets();
  void
  write_outgoing_packet(size_t payload, uint flags, struct utp_iovec *iovec,
                        size_t num_iovecs);

#ifdef _DEBUG
  void
  check_invariant();
#endif

  void
  check_timeouts();
  int
  ack_packet(uint16 seq);
  size_t
  selective_ack_bytes(uint base, const byte *mask, byte len, int64 &min_rtt);
  void
  selective_ack(uint base, const byte *mask, byte len);
  void
  apply_ccontrol(size_t bytes_acked, uint32 actual_delay, int64 min_rtt);
  size_t
  get_packet_size() const;
};

void
removeSocketFromAckList(UTPSocket *conn)
{
  if(conn->ida >= 0)
  {
    UTPSocket *last =
        conn->ctx->ack_sockets[conn->ctx->ack_sockets.GetCount() - 1];

    assert(last->ida < (int)(conn->ctx->ack_sockets.GetCount()));
    assert(conn->ctx->ack_sockets[last->ida] == last);
    last->ida                         = conn->ida;
    conn->ctx->ack_sockets[conn->ida] = last;
    conn->ida                         = -1;

    // Decrease the count
    conn->ctx->ack_sockets.SetCount(conn->ctx->ack_sockets.GetCount() - 1);
  }
}

static void
utp_register_sent_packet(utp_context *ctx, size_t length)
{
  if(length <= PACKET_SIZE_MID)
  {
    if(length <= PACKET_SIZE_EMPTY)
    {
      ctx->context_stats._nraw_send[PACKET_SIZE_EMPTY_BUCKET]++;
    }
    else if(length <= PACKET_SIZE_SMALL)
    {
      ctx->context_stats._nraw_send[PACKET_SIZE_SMALL_BUCKET]++;
    }
    else
      ctx->context_stats._nraw_send[PACKET_SIZE_MID_BUCKET]++;
  }
  else
  {
    if(length <= PACKET_SIZE_BIG)
    {
      ctx->context_stats._nraw_send[PACKET_SIZE_BIG_BUCKET]++;
    }
    else
      ctx->context_stats._nraw_send[PACKET_SIZE_HUGE_BUCKET]++;
  }
}

void
send_to_addr(utp_context *ctx, const byte *p, size_t len,
             const PackedSockAddr &addr, int flags = 0)
{
  socklen_t tolen;
  SOCKADDR_STORAGE to = addr.get_sockaddr_storage(&tolen);
  utp_register_sent_packet(ctx, len);
  utp_call_sendto(ctx, NULL, p, len, (const struct sockaddr *)&to, tolen,
                  flags);
}

void
UTPSocket::schedule_ack()
{
  if(ida == -1)
  {
#if UTP_DEBUG_LOGGING
    log(UTP_LOG_DEBUG, "schedule_ack");
#endif
    ida = ctx->ack_sockets.Append(this);
  }
  else
  {
#if UTP_DEBUG_LOGGING
    log(UTP_LOG_DEBUG, "schedule_ack: already in list");
#endif
  }
}

void
UTPSocket::send_data(byte *b, size_t length, bandwidth_type_t type,
                     uint32 flags)
{
  // time stamp this packet with local time, the stamp goes into
  // the header of every packet at the 8th byte for 8 bytes :
  // two integers, check packet.h for more
  uint64 time = utp_call_get_microseconds(ctx, this);

  PacketFormatV1 *b1 = (PacketFormatV1 *)b;
  b1->tv_usec        = (uint32)time;
  b1->reply_micro    = reply_micro;

  last_sent_packet = ctx->current_ms;

#ifdef _DEBUG
  _stats.nbytes_xmit += length;
  ++_stats.nxmit;
#endif

  if(ctx->callbacks[UTP_ON_OVERHEAD_STATISTICS])
  {
    size_t n;
    if(type == payload_bandwidth)
    {
      // if this packet carries payload, just
      // count the header as overhead
      type = header_overhead;
      n    = get_overhead();
    }
    else
    {
      n = length + get_udp_overhead();
    }
    utp_call_on_overhead_statistics(ctx, this, true, n, type);
  }
#if UTP_DEBUG_LOGGING
  int flags2    = b1->type();
  uint16 seq_nr = b1->seq_nr;
  uint16 ack_nr = b1->ack_nr;
  log(UTP_LOG_DEBUG,
      "send %s len:%u id:%u timestamp:" I64u
      " reply_micro:%u flags:%s seq_nr:%u ack_nr:%u",
      addrfmt(addr, addrbuf), (uint)length, conn_id_send, time, reply_micro,
      flagnames[flags2], seq_nr, ack_nr);
#endif
  send_to_addr(ctx, b, length, addr, flags);
  removeSocketFromAckList(this);
}

void
UTPSocket::send_ack(bool synack)
{
  PacketFormatAckV1 pfa;
  zeromem(&pfa);

  size_t len;
  last_rcv_win = get_rcv_window();
  pfa.pf.set_version(1);
  pfa.pf.set_type(ST_STATE);
  pfa.pf.ext        = 0;
  pfa.pf.connid     = conn_id_send;
  pfa.pf.ack_nr     = ack_nr;
  pfa.pf.seq_nr     = seq_nr;
  pfa.pf.windowsize = (uint32)last_rcv_win;
  len               = sizeof(PacketFormatV1);

  // we never need to send EACK for connections
  // that are shutting down
  if(reorder_count != 0 && !got_fin_reached)
  {
    // if reorder count > 0, send an EACK.
    // reorder count should always be 0
    // for synacks, so this should not be
    // as synack
    assert(!synack);
    (void)synack;
    pfa.pf.ext   = 1;
    pfa.ext_next = 0;
    pfa.ext_len  = 4;
    uint m       = 0;

    // reorder count should only be non-zero
    // if the packet ack_nr + 1 has not yet
    // been received
    assert(inbuf.get(ack_nr + 1) == NULL);
    size_t window = min< size_t >(14 + 16, inbuf.size());
    // Generate bit mask of segments received.
    for(size_t i = 0; i < window; i++)
    {
      if(inbuf.get(ack_nr + i + 2) != NULL)
      {
        m |= 1 << i;

#if UTP_DEBUG_LOGGING
        log(UTP_LOG_DEBUG, "EACK packet [%u]", ack_nr + i + 2);
#endif
      }
    }
    pfa.acks[0] = (byte)m;
    pfa.acks[1] = (byte)(m >> 8);
    pfa.acks[2] = (byte)(m >> 16);
    pfa.acks[3] = (byte)(m >> 24);
    len += 4 + 2;

#if UTP_DEBUG_LOGGING
    log(UTP_LOG_DEBUG, "Sending EACK %u [%u] bits:[%032b]", ack_nr,
        conn_id_send, m);
#endif
  }
  else
  {
#if UTP_DEBUG_LOGGING
    log(UTP_LOG_DEBUG, "Sending ACK %u [%u]", ack_nr, conn_id_send);
#endif
  }

  send_data((byte *)&pfa, len, ack_overhead);
  removeSocketFromAckList(this);
}

void
UTPSocket::send_keep_alive()
{
  ack_nr--;

#if UTP_DEBUG_LOGGING
  log(UTP_LOG_DEBUG, "Sending KeepAlive ACK %u [%u]", ack_nr, conn_id_send);
#endif

  send_ack();
  ack_nr++;
}

void
UTPSocket::send_rst(utp_context *ctx, const PackedSockAddr &addr,
                    uint32 conn_id_send, uint16 ack_nr, uint16 seq_nr)
{
  PacketFormatV1 pf1;
  zeromem(&pf1);

  size_t len;
  pf1.set_version(1);
  pf1.set_type(ST_RESET);
  pf1.ext        = 0;
  pf1.connid     = conn_id_send;
  pf1.ack_nr     = ack_nr;
  pf1.seq_nr     = seq_nr;
  pf1.windowsize = 0;
  len            = sizeof(PacketFormatV1);

  //	LOG_DEBUG("%s: Sending RST id:%u seq_nr:%u ack_nr:%u", addrfmt(addr,
  // addrbuf), conn_id_send, seq_nr, ack_nr); 	LOG_DEBUG("send %s len:%u
  // id:%u", addrfmt(addr, addrbuf), (uint)len, conn_id_send);
  send_to_addr(ctx, (const byte *)&pf1, len, addr);
}

void
UTPSocket::send_packet(OutgoingPacket *pkt)
{
  // only count against the quota the first time we
  // send the packet. Don't enforce quota when closing
  // a socket. Only enforce the quota when we're sending
  // at slow rates (max window < packet size)

  // size_t max_send = min(max_window, opt_sndbuf, max_window_user);
  time_t cur_time = utp_call_get_milliseconds(this->ctx, this);

  if(pkt->transmissions == 0 || pkt->need_resend)
  {
    cur_window += pkt->payload;
  }

  pkt->need_resend = false;

  PacketFormatV1 *p1 = (PacketFormatV1 *)pkt->data;
  p1->ack_nr         = ack_nr;
  pkt->time_sent     = utp_call_get_microseconds(this->ctx, this);

  // socklen_t salen;
  // SOCKADDR_STORAGE sa = addr.get_sockaddr_storage(&salen);
  bool use_as_mtu_probe = false;

  // TODO: this is subject to nasty wrapping issues! Below as well
  if(mtu_discover_time < (uint64)cur_time)
  {
    // it's time to reset our MTU assupmtions
    // and trigger a new search
    mtu_reset();
  }

  // don't use packets that are larger then mtu_ceiling
  // as probes, since they were probably used as probes
  // already and failed, now we need it to fragment
  // just to get it through
  // if seq_nr == 1, the probe would end up being 0
  // which is a magic number representing no-probe
  // that why we don't send a probe for a packet with
  // sequence number 0
  if(mtu_floor < mtu_ceiling && pkt->length > mtu_floor
     && pkt->length <= mtu_ceiling && mtu_probe_seq == 0 && seq_nr != 1
     && pkt->transmissions == 0)
  {
    // we've already incremented seq_nr
    // for this packet
    mtu_probe_seq  = (seq_nr - 1) & ACK_NR_MASK;
    mtu_probe_size = pkt->length;
    assert(pkt->length >= mtu_floor);
    assert(pkt->length <= mtu_ceiling);
    use_as_mtu_probe = true;
    log(UTP_LOG_MTU, "MTU [PROBE] floor:%d ceiling:%d current:%d", mtu_floor,
        mtu_ceiling, mtu_probe_size);
  }

  pkt->transmissions++;
  send_data(
      (byte *)pkt->data, pkt->length,
      (state == CS_SYN_SENT)
          ? connect_overhead
          : (pkt->transmissions == 1) ? payload_bandwidth : retransmit_overhead,
      use_as_mtu_probe ? UTP_UDP_DONTFRAG : 0);
}

bool
UTPSocket::is_full(int bytes)
{
  size_t packet_size = get_packet_size();
  if(bytes < 0)
    bytes = packet_size;
  else if(bytes > (int)packet_size)
    bytes = (int)packet_size;
  size_t max_send = min(max_window, opt_sndbuf, max_window_user);

  // subtract one to save space for the FIN packet
  if(cur_window_packets >= OUTGOING_BUFFER_MAX_SIZE - 1)
  {
#if UTP_DEBUG_LOGGING
    log(UTP_LOG_DEBUG, "is_full:false cur_window_packets:%d MAX:%d",
        cur_window_packets, OUTGOING_BUFFER_MAX_SIZE - 1);
#endif

    last_maxed_out_window = ctx->current_ms;
    return true;
  }

#if UTP_DEBUG_LOGGING
  log(UTP_LOG_DEBUG,
      "is_full:%s. cur_window:%u pkt:%u max:%u cur_window_packets:%u "
      "max_window:%u",
      (cur_window + bytes > max_send) ? "true" : "false", cur_window, bytes,
      max_send, cur_window_packets, max_window);
#endif

  if(cur_window + bytes > max_send)
  {
    last_maxed_out_window = ctx->current_ms;
    return true;
  }
  return false;
}

bool
UTPSocket::flush_packets()
{
  size_t packet_size = get_packet_size();

  // send packets that are waiting on the pacer to be sent
  // i has to be an unsigned 16 bit counter to wrap correctly
  // signed types are not guaranteed to wrap the way you expect
  for(uint16 i = seq_nr - cur_window_packets; i != seq_nr; ++i)
  {
    OutgoingPacket *pkt = (OutgoingPacket *)outbuf.get(i);
    if(pkt == 0 || (pkt->transmissions > 0 && pkt->need_resend == false))
      continue;
    // have we run out of quota?
    if(is_full())
      return true;

    // Nagle check
    // don't send the last packet if we have one packet in-flight
    // and the current packet is still smaller than packet_size.
    if(i != ((seq_nr - 1) & ACK_NR_MASK) || cur_window_packets == 1
       || pkt->payload >= packet_size)
    {
      send_packet(pkt);
    }
  }
  return false;
}

// @payload: number of bytes to send
// @flags: either ST_DATA, or ST_FIN
// @iovec: base address of iovec array
// @num_iovecs: number of iovecs in array
void
UTPSocket::write_outgoing_packet(size_t payload, uint flags,
                                 struct utp_iovec *iovec, size_t num_iovecs)
{
  // Setup initial timeout timer
  if(cur_window_packets == 0)
  {
    retransmit_timeout = rto;
    rto_timeout        = ctx->current_ms + retransmit_timeout;
    assert(cur_window == 0);
  }

  size_t packet_size = get_packet_size();
  do
  {
    assert(cur_window_packets < OUTGOING_BUFFER_MAX_SIZE);
    assert(flags == ST_DATA || flags == ST_FIN);

    size_t added = 0;

    OutgoingPacket *pkt = NULL;

    if(cur_window_packets > 0)
    {
      pkt = (OutgoingPacket *)outbuf.get(seq_nr - 1);
    }

    const size_t header_size = get_header_size();
    bool append              = true;

    // if there's any room left in the last packet in the window
    // and it hasn't been sent yet, fill that frame first
    if(payload && pkt && !pkt->transmissions && pkt->payload < packet_size)
    {
      // Use the previous unsent packet
      added =
          min(payload + pkt->payload, max< size_t >(packet_size, pkt->payload))
          - pkt->payload;
      pkt = (OutgoingPacket *)realloc(
          pkt,
          (sizeof(OutgoingPacket) - 1) + header_size + pkt->payload + added);
      outbuf.put(seq_nr - 1, pkt);
      append = false;
      assert(!pkt->need_resend);
    }
    else
    {
      // Create the packet to send.
      added = payload;
      pkt = (OutgoingPacket *)malloc((sizeof(OutgoingPacket) - 1) + header_size
                                     + added);
      pkt->payload       = 0;
      pkt->transmissions = 0;
      pkt->need_resend   = false;
    }

    if(added)
    {
      assert(flags == ST_DATA);

      // Fill it with data from the upper layer.
      unsigned char *p = pkt->data + header_size + pkt->payload;
      size_t needed    = added;

      /*
      while (needed) {
              *p = *(char*)iovec[0].iov_base;
              p++;
              iovec[0].iov_base = (char *)iovec[0].iov_base + 1;
              needed--;
      }
      */

      for(size_t i = 0; i < num_iovecs && needed; i++)
      {
        if(iovec[i].iov_len == 0)
          continue;

        size_t num = min< size_t >(needed, iovec[i].iov_len);
        memcpy(p, iovec[i].iov_base, num);

        p += num;

        iovec[i].iov_len -= num;
        iovec[i].iov_base = (byte *)iovec[i].iov_base
            + num;  // iovec[i].iov_base += num, but without void* pointers
        needed -= num;
      }

      assert(needed == 0);
    }
    pkt->payload += added;
    pkt->length = header_size + pkt->payload;

    last_rcv_win = get_rcv_window();

    PacketFormatV1 *p1 = (PacketFormatV1 *)pkt->data;
    // p1->ver_type; needs to be set!!
    p1->set_version(1);
    p1->set_type(flags);
    p1->ext        = 0;
    p1->connid     = conn_id_send;
    p1->windowsize = (uint32)last_rcv_win;
    p1->ack_nr     = ack_nr;

    if(append)
    {
      // Remember the message in the outgoing queue.
      outbuf.ensure_size(seq_nr, cur_window_packets);
      outbuf.put(seq_nr, pkt);
      p1->seq_nr = seq_nr;
      seq_nr++;
      cur_window_packets++;
    }

    payload -= added;

  } while(payload);

  flush_packets();
}

#ifdef _DEBUG
void
UTPSocket::check_invariant()
{
  if(reorder_count > 0)
  {
    assert(inbuf.get(ack_nr + 1) == NULL);
  }

  size_t outstanding_bytes = 0;
  for(int i = 0; i < cur_window_packets; ++i)
  {
    OutgoingPacket *pkt = (OutgoingPacket *)outbuf.get(seq_nr - i - 1);
    if(pkt == 0 || pkt->transmissions == 0 || pkt->need_resend)
      continue;
    outstanding_bytes += pkt->payload;
  }
  assert(outstanding_bytes == cur_window);
}
#endif

void
UTPSocket::check_timeouts()
{
#ifdef _DEBUG
  check_invariant();
#endif

  // this invariant should always be true
  assert(cur_window_packets == 0 || outbuf.get(seq_nr - cur_window_packets));

#if UTP_DEBUG_LOGGING
  log(UTP_LOG_DEBUG,
      "CheckTimeouts timeout:%d max_window:%u cur_window:%u "
      "state:%s cur_window_packets:%u",
      (int)(rto_timeout - ctx->current_ms), (uint)max_window, (uint)cur_window,
      statenames[state], cur_window_packets);
#endif

  if(state != CS_DESTROY)
    flush_packets();

  switch(state)
  {
    case CS_SYN_SENT:
    case CS_SYN_RECV:
    case CS_CONNECTED_FULL:
    case CS_CONNECTED:
    {
      // Reset max window...
      if((int)(ctx->current_ms - zerowindow_time) >= 0 && max_window_user == 0)
      {
        max_window_user = PACKET_SIZE;
      }

      if((int)(ctx->current_ms - rto_timeout) >= 0 && rto_timeout > 0)
      {
        bool ignore_loss = false;

        if(cur_window_packets == 1
           && ((seq_nr - 1) & ACK_NR_MASK) == mtu_probe_seq
           && mtu_probe_seq != 0)
        {
          // we only had  a single outstanding packet that timed out, and it was
          // the probe
          mtu_ceiling = mtu_probe_size - 1;
          mtu_search_update();
          // this packet was most likely dropped because the packet size being
          // too big and not because congestion. To accelerate the binary search
          // for the MTU, resend immediately and don't reset the window size
          ignore_loss = true;
          log(UTP_LOG_MTU, "MTU [PROBE-TIMEOUT] floor:%d ceiling:%d current:%d",
              mtu_floor, mtu_ceiling, mtu_last);
        }
        // we dropepd the probe, clear these fields to
        // allow us to send a new one
        mtu_probe_seq = mtu_probe_size = 0;
        log(UTP_LOG_MTU, "MTU [TIMEOUT]");

        /*
        OutgoingPacket *pkt = (OutgoingPacket*)outbuf.get(seq_nr -
        cur_window_packets);

        // If there were a lot of retransmissions, force recomputation of round
        trip time if (pkt->transmissions >= 4) rtt = 0;
        */

        // Increase RTO
        const uint new_timeout =
            ignore_loss ? retransmit_timeout : retransmit_timeout * 2;

        // They initiated the connection but failed to respond before the rto.
        // A malicious client can also spoof the destination address of a ST_SYN
        // bringing us to this state. Kill the connection and do not notify the
        // upper layer
        if(state == CS_SYN_RECV)
        {
          state = CS_DESTROY;
          utp_call_on_error(ctx, this, UTP_ETIMEDOUT);
          return;
        }

        // We initiated the connection but the other side failed to respond
        // before the rto
        if(retransmit_count >= 4
           || (state == CS_SYN_SENT && retransmit_count >= 2))
        {
          // 4 consecutive transmissions have timed out. Kill it. If we
          // haven't even connected yet, give up after only 2 consecutive
          // failed transmissions.
          if(close_requested)
            state = CS_DESTROY;
          else
            state = CS_RESET;
          utp_call_on_error(ctx, this, UTP_ETIMEDOUT);
          return;
        }

        retransmit_timeout = new_timeout;
        rto_timeout        = ctx->current_ms + new_timeout;

        if(!ignore_loss)
        {
          // On Timeout
          duplicate_ack = 0;

          int packet_size = get_packet_size();

          if((cur_window_packets == 0) && ((int)max_window > packet_size))
          {
            // we don't have any packets in-flight, even though
            // we could. This implies that the connection is just
            // idling. No need to be aggressive about resetting the
            // congestion window. Just let it decay by a 3:rd.
            // don't set it any lower than the packet size though
            max_window = max(max_window * 2 / 3, size_t(packet_size));
          }
          else
          {
            // our delay was so high that our congestion window
            // was shrunk below one packet, preventing us from
            // sending anything for one time-out period. Now, reset
            // the congestion window to fit one packet, to start over
            // again
            max_window = packet_size;
            slow_start = true;
          }
        }

        // every packet should be considered lost
        for(int i = 0; i < cur_window_packets; ++i)
        {
          OutgoingPacket *pkt = (OutgoingPacket *)outbuf.get(seq_nr - i - 1);
          if(pkt == 0 || pkt->transmissions == 0 || pkt->need_resend)
            continue;
          pkt->need_resend = true;
          assert(cur_window >= pkt->payload);
          cur_window -= pkt->payload;
        }

        if(cur_window_packets > 0)
        {
          retransmit_count++;
          // used in parse_log.py
          log(UTP_LOG_NORMAL,
              "Packet timeout. Resend. seq_nr:%u. timeout:%u "
              "max_window:%u cur_window_packets:%d",
              seq_nr - cur_window_packets, retransmit_timeout, (uint)max_window,
              int(cur_window_packets));

          fast_timeout   = true;
          timeout_seq_nr = seq_nr;

          OutgoingPacket *pkt =
              (OutgoingPacket *)outbuf.get(seq_nr - cur_window_packets);
          assert(pkt);

          // Re-send the packet.
          send_packet(pkt);
        }
      }

      // Mark the socket as writable. If the cwnd has grown, or if the number of
      // bytes in-flight is lower than cwnd, we need to make the socket writable
      // again in case it isn't
      if(state == CS_CONNECTED_FULL && !is_full())
      {
        state = CS_CONNECTED;

#if UTP_DEBUG_LOGGING
        log(UTP_LOG_DEBUG,
            "Socket writable. max_window:%u cur_window:%u packet_size:%u",
            (uint)max_window, (uint)cur_window, (uint)get_packet_size());
#endif
        utp_call_on_state_change(this->ctx, this, UTP_STATE_WRITABLE);
      }

      if(state >= CS_CONNECTED && !fin_sent)
      {
        if((int)(ctx->current_ms - last_sent_packet) >= KEEPALIVE_INTERVAL)
        {
          send_keep_alive();
        }
      }
      break;
    }

    // prevent warning
    case CS_UNINITIALIZED:
    case CS_IDLE:
    case CS_RESET:
    case CS_DESTROY:
      break;
  }
}

// this should be called every time we change mtu_floor or mtu_ceiling
void
UTPSocket::mtu_search_update()
{
  assert(mtu_floor <= mtu_ceiling);

  // binary search
  mtu_last = (mtu_floor + mtu_ceiling) / 2;

  // enable a new probe to be sent
  mtu_probe_seq = mtu_probe_size = 0;

  // if the floor and ceiling are close enough, consider the
  // MTU binary search complete. We set the current value
  // to floor since that's the only size we know can go through
  // also set the ceiling to floor to terminate the searching
  if(mtu_ceiling - mtu_floor <= 16)
  {
    mtu_last = mtu_floor;
    log(UTP_LOG_MTU, "MTU [DONE] floor:%d ceiling:%d current:%d", mtu_floor,
        mtu_ceiling, mtu_last);
    mtu_ceiling = mtu_floor;
    assert(mtu_floor <= mtu_ceiling);
    // Do another search in 30 minutes
    mtu_discover_time =
        utp_call_get_milliseconds(this->ctx, this) + 30 * 60 * 1000;
  }
}

void
UTPSocket::mtu_reset()
{
  mtu_ceiling = get_udp_mtu();
  // Less would not pass TCP...
  mtu_floor = 576;
  log(UTP_LOG_MTU, "MTU [RESET] floor:%d ceiling:%d current:%d", mtu_floor,
      mtu_ceiling, mtu_last);
  assert(mtu_floor <= mtu_ceiling);
  mtu_discover_time =
      utp_call_get_milliseconds(this->ctx, this) + 30 * 60 * 1000;
}

// returns:
// 0: the packet was acked.
// 1: it means that the packet had already been acked
// 2: the packet has not been sent yet
int
UTPSocket::ack_packet(uint16 seq)
{
  OutgoingPacket *pkt = (OutgoingPacket *)outbuf.get(seq);

  // the packet has already been acked (or not sent)
  if(pkt == NULL)
  {
#if UTP_DEBUG_LOGGING
    log(UTP_LOG_DEBUG, "got ack for:%u (already acked, or never sent)", seq);
#endif

    return 1;
  }

  // can't ack packets that haven't been sent yet!
  if(pkt->transmissions == 0)
  {
#if UTP_DEBUG_LOGGING
    log(UTP_LOG_DEBUG,
        "got ack for:%u (never sent, pkt_size:%u need_resend:%u)", seq,
        (uint)pkt->payload, pkt->need_resend);
#endif

    return 2;
  }

#if UTP_DEBUG_LOGGING
  log(UTP_LOG_DEBUG, "got ack for:%u (pkt_size:%u need_resend:%u)", seq,
      (uint)pkt->payload, pkt->need_resend);
#endif

  outbuf.put(seq, NULL);

  // if we never re-sent the packet, update the RTT estimate
  if(pkt->transmissions == 1)
  {
    // Estimate the round trip time.
    const uint32 ertt = (uint32)(
        (utp_call_get_microseconds(this->ctx, this) - pkt->time_sent) / 1000);
    if(rtt == 0)
    {
      // First round trip time sample
      rtt     = ertt;
      rtt_var = ertt / 2;
      // sanity check. rtt should never be more than 6 seconds
      //			assert(rtt < 6000);
    }
    else
    {
      // Compute new round trip times
      const int delta = (int)rtt - ertt;
      rtt_var         = rtt_var + (int)(abs(delta) - rtt_var) / 4;
      rtt             = rtt - rtt / 8 + ertt / 8;
      // sanity check. rtt should never be more than 6 seconds
      //			assert(rtt < 6000);
      rtt_hist.add_sample(ertt, ctx->current_ms);
    }
    rto = max< uint >(rtt + rtt_var * 4, 1000);

#if UTP_DEBUG_LOGGING
    log(UTP_LOG_DEBUG, "rtt:%u avg:%u var:%u rto:%u", ertt, rtt, rtt_var, rto);
#endif
  }
  retransmit_timeout = rto;
  rto_timeout        = ctx->current_ms + rto;
  // if need_resend is set, this packet has already
  // been considered timed-out, and is not included in
  // the cur_window anymore
  if(!pkt->need_resend)
  {
    assert(cur_window >= pkt->payload);
    cur_window -= pkt->payload;
  }
  free(pkt);
  retransmit_count = 0;
  return 0;
}

// count the number of bytes that were acked by the EACK header
size_t
UTPSocket::selective_ack_bytes(uint base, const byte *mask, byte len,
                               int64 &min_rtt)
{
  if(cur_window_packets == 0)
    return 0;

  size_t acked_bytes = 0;
  int bits           = len * 8;
  uint64 now         = utp_call_get_microseconds(this->ctx, this);

  do
  {
    uint v = base + bits;

    // ignore bits that haven't been sent yet
    // see comment in UTPSocket::selective_ack
    if(((seq_nr - v - 1) & ACK_NR_MASK) >= (uint16)(cur_window_packets - 1))
      continue;

    // ignore bits that represents packets we haven't sent yet
    // or packets that have already been acked
    OutgoingPacket *pkt = (OutgoingPacket *)outbuf.get(v);
    if(!pkt || pkt->transmissions == 0)
      continue;

    // Count the number of segments that were successfully received past it.
    if(bits >= 0 && mask[bits >> 3] & (1 << (bits & 7)))
    {
      assert((int)(pkt->payload) >= 0);
      acked_bytes += pkt->payload;
      if(pkt->time_sent < now)
        min_rtt = min< int64 >(min_rtt, now - pkt->time_sent);
      else
        min_rtt = min< int64 >(min_rtt, 50000);
      continue;
    }
  } while(--bits >= -1);
  return acked_bytes;
}

enum
{
  MAX_EACK = 128
};

void
UTPSocket::selective_ack(uint base, const byte *mask, byte len)
{
  if(cur_window_packets == 0)
    return;

  // the range is inclusive [0, 31] bits
  int bits = len * 8 - 1;

  int count = 0;

  // resends is a stack of sequence numbers we need to resend. Since we
  // iterate in reverse over the acked packets, at the end, the top packets
  // are the ones we want to resend
  int resends[MAX_EACK];
  int nr = 0;

#if UTP_DEBUG_LOGGING
  char bitmask[1024] = {0};
  int counter        = bits;
  for(int i = 0; i <= bits; ++i)
  {
    bool bit_set = counter >= 0 && mask[counter >> 3] & (1 << (counter & 7));
    bitmask[i]   = bit_set ? '1' : '0';
    --counter;
  }

  log(UTP_LOG_DEBUG, "Got EACK [%s] base:%u", bitmask, base);
#endif

  do
  {
    // we're iterating over the bits from higher sequence numbers
    // to lower (kind of in reverse order, wich might not be very
    // intuitive)
    uint v = base + bits;

    // ignore bits that haven't been sent yet
    // and bits that fall below the ACKed sequence number
    // this can happen if an EACK message gets
    // reordered and arrives after a packet that ACKs up past
    // the base for thie EACK message

    // this is essentially the same as:
    // if v >= seq_nr || v <= seq_nr - cur_window_packets
    // but it takes wrapping into account

    // if v == seq_nr the -1 will make it wrap. if v > seq_nr
    // it will also wrap (since it will fall further below 0)
    // and be > cur_window_packets.
    // if v == seq_nr - cur_window_packets, the result will be
    // seq_nr - (seq_nr - cur_window_packets) - 1
    // == seq_nr - seq_nr + cur_window_packets - 1
    // == cur_window_packets - 1 which will be caught by the
    // test. If v < seq_nr - cur_window_packets the result will grow
    // fall furhter outside of the cur_window_packets range.

    // sequence number space:
    //
    //     rejected <   accepted   > rejected
    // <============+--------------+============>
    //              ^              ^
    //              |              |
    //        (seq_nr-wnd)         seq_nr

    if(((seq_nr - v - 1) & ACK_NR_MASK) >= (uint16)(cur_window_packets - 1))
      continue;

    // this counts as a duplicate ack, even though we might have
    // received an ack for this packet previously (in another EACK
    // message for instance)
    bool bit_set = bits >= 0 && mask[bits >> 3] & (1 << (bits & 7));

    // if this packet is acked, it counts towards the duplicate ack counter
    if(bit_set)
      count++;

    // ignore bits that represents packets we haven't sent yet
    // or packets that have already been acked
    OutgoingPacket *pkt = (OutgoingPacket *)outbuf.get(v);
    if(!pkt || pkt->transmissions == 0)
    {
#if UTP_DEBUG_LOGGING
      log(UTP_LOG_DEBUG, "skipping %u. pkt:%08x transmissions:%u %s", v, pkt,
          pkt ? pkt->transmissions : 0,
          pkt ? "(not sent yet?)" : "(already acked?)");
#endif
      continue;
    }

    // Count the number of segments that were successfully received past it.
    if(bit_set)
    {
      // the selective ack should never ACK the packet we're waiting for to
      // decrement cur_window_packets
      assert((v & outbuf.mask)
             != ((seq_nr - cur_window_packets) & outbuf.mask));
      ack_packet(v);
      continue;
    }

    // Resend segments
    // if count is less than our re-send limit, we haven't seen enough
    // acked packets in front of this one to warrant a re-send.
    // if count == 0, we're still going through the tail of zeroes
    if(((v - fast_resend_seq_nr) & ACK_NR_MASK) <= OUTGOING_BUFFER_MAX_SIZE
       && count >= DUPLICATE_ACKS_BEFORE_RESEND)
    {
      // resends is a stack, and we're mostly interested in the top of it
      // if we're full, just throw away the lower half
      if(nr >= MAX_EACK - 2)
      {
        memmove(resends, &resends[MAX_EACK / 2],
                MAX_EACK / 2 * sizeof(resends[0]));
        nr -= MAX_EACK / 2;
      }
      resends[nr++] = v;

#if UTP_DEBUG_LOGGING
      log(UTP_LOG_DEBUG, "no ack for %u", v);
#endif
    }
    else
    {
#if UTP_DEBUG_LOGGING
      log(UTP_LOG_DEBUG,
          "not resending %u count:%d dup_ack:%u fast_resend_seq_nr:%u", v,
          count, duplicate_ack, fast_resend_seq_nr);
#endif
    }
  } while(--bits >= -1);

  if(((base - 1 - fast_resend_seq_nr) & ACK_NR_MASK) <= OUTGOING_BUFFER_MAX_SIZE
     && count >= DUPLICATE_ACKS_BEFORE_RESEND)
  {
    // if we get enough duplicate acks to start
    // resending, the first packet we should resend
    // is base-1
    resends[nr++] = (base - 1) & ACK_NR_MASK;

#if UTP_DEBUG_LOGGING
    log(UTP_LOG_DEBUG, "no ack for %u", (base - 1) & ACK_NR_MASK);
#endif
  }
  else
  {
#if UTP_DEBUG_LOGGING
    log(UTP_LOG_DEBUG,
        "not resending %u count:%d dup_ack:%u fast_resend_seq_nr:%u", base - 1,
        count, duplicate_ack, fast_resend_seq_nr);
#endif
  }

  bool back_off = false;
  int i         = 0;
  while(nr > 0)
  {
    uint v = resends[--nr];
    // don't consider the tail of 0:es to be lost packets
    // only unacked packets with acked packets after should
    // be considered lost
    OutgoingPacket *pkt = (OutgoingPacket *)outbuf.get(v);

    // this may be an old (re-ordered) packet, and some of the
    // packets in here may have been acked already. In which
    // case they will not be in the send queue anymore
    if(!pkt)
      continue;

    // used in parse_log.py
    log(UTP_LOG_NORMAL, "Packet %u lost. Resending", v);

    // On Loss
    back_off = true;

#ifdef _DEBUG
    ++_stats.rexmit;
#endif

    send_packet(pkt);
    fast_resend_seq_nr = (v + 1) & ACK_NR_MASK;

    // Re-send max 4 packets.
    if(++i >= 4)
      break;
  }

  if(back_off)
    maybe_decay_win(ctx->current_ms);

  duplicate_ack = count;
}

void
UTPSocket::apply_ccontrol(size_t bytes_acked, uint32 actual_delay,
                          int64 min_rtt)
{
  // the delay can never be greater than the rtt. The min_rtt
  // variable is the RTT in microseconds

  assert(min_rtt >= 0);
  int32 our_delay = min< uint32 >(our_hist.get_value(), uint32(min_rtt));
  assert(our_delay != INT_MAX);
  assert(our_delay >= 0);

  utp_call_on_delay_sample(this->ctx, this, our_delay / 1000);

  // This test the connection under heavy load from foreground
  // traffic. Pretend that our delays are very high to force the
  // connection to use sub-packet size window sizes
  // our_delay *= 4;

  // target is microseconds
  int target = target_delay;
  if(target <= 0)
    target = 100000;

  // this is here to compensate for very large clock drift that affects
  // the congestion controller into giving certain endpoints an unfair
  // share of the bandwidth. We have an estimate of the clock drift
  // (clock_drift). The unit of this is microseconds per 5 seconds.
  // empirically, a reasonable cut-off appears to be about 200000
  // (which is pretty high). The main purpose is to compensate for
  // people trying to "cheat" uTP by making their clock run slower,
  // and this definitely catches that without any risk of false positives
  // if clock_drift < -200000 start applying a penalty delay proportional
  // to how far beoynd -200000 the clock drift is
  int32 penalty = 0;
  if(clock_drift < -200000)
  {
    penalty = (-clock_drift - 200000) / 7;
    our_delay += penalty;
  }

  double off_target = target - our_delay;

  // this is the same as:
  //
  //    (min(off_target, target) / target) * (bytes_acked / max_window) *
  //    MAX_CWND_INCREASE_BYTES_PER_RTT
  //
  // so, it's scaling the max increase by the fraction of the window this ack
  // represents, and the fraction of the target delay the current delay
  // represents. The min() around off_target protects against crazy values of
  // our_delay, which may happen when th timestamps wraps, or by just having a
  // malicious peer sending garbage. This caps the increase of the window size
  // to MAX_CWND_INCREASE_BYTES_PER_RTT per rtt. as for large negative numbers,
  // this direction is already capped at the min packet size further down the
  // min around the bytes_acked protects against the case where the window size
  // was recently shrunk and the number of acked bytes exceeds that. This is
  // considered no more than one full window, in order to keep the gain within
  // sane boundries.

  assert(bytes_acked > 0);
  double window_factor = (double)min(bytes_acked, max_window)
      / (double)max(max_window, bytes_acked);

  double delay_factor = off_target / target;
  double scaled_gain =
      MAX_CWND_INCREASE_BYTES_PER_RTT * window_factor * delay_factor;

  // since MAX_CWND_INCREASE_BYTES_PER_RTT is a cap on how much the window size
  // (max_window) may increase per RTT, we may not increase the window size more
  // than that proportional to the number of bytes that were acked, so that once
  // one window has been acked (one rtt) the increase limit is not exceeded the
  // +1. is to allow for floating point imprecision
  assert(scaled_gain <= 1.
             + MAX_CWND_INCREASE_BYTES_PER_RTT
                 * (double)min(bytes_acked, max_window)
                 / (double)max(max_window, bytes_acked));

  if(scaled_gain > 0 && ctx->current_ms - last_maxed_out_window > 1000)
  {
    // if it was more than 1 second since we tried to send a packet
    // and stopped because we hit the max window, we're most likely rate
    // limited (which prevents us from ever hitting the window size)
    // if this is the case, we cannot let the max_window grow indefinitely
    scaled_gain = 0;
  }

  size_t ledbat_cwnd = (max_window + scaled_gain < MIN_WINDOW_SIZE)
      ? MIN_WINDOW_SIZE
      : (size_t)(max_window + scaled_gain);

  if(slow_start)
  {
    size_t ss_cwnd = (size_t)(max_window + window_factor * get_packet_size());
    if(ss_cwnd > ssthresh)
    {
      slow_start = false;
    }
    else if(our_delay > target * 0.9)
    {
      // even if we're a little under the target delay, we conservatively
      // discontinue the slow start phase
      slow_start = false;
      ssthresh   = max_window;
    }
    else
    {
      max_window = max(ss_cwnd, ledbat_cwnd);
    }
  }
  else
  {
    max_window = ledbat_cwnd;
  }

  // make sure that the congestion window is below max
  // make sure that we don't shrink our window too small
  max_window = clamp< size_t >(max_window, MIN_WINDOW_SIZE, opt_sndbuf);

  // used in parse_log.py
  log(UTP_LOG_NORMAL,
      "actual_delay:%u our_delay:%d their_delay:%u off_target:%d max_window:%u "
      "delay_base:%u delay_sum:%d target_delay:%d acked_bytes:%u cur_window:%u "
      "scaled_gain:%f rtt:%u rate:%u wnduser:%u rto:%u timeout:%d "
      "get_microseconds:" I64u
      " "
      "cur_window_packets:%u packet_size:%u their_delay_base:%u "
      "their_actual_delay:%u "
      "average_delay:%d clock_drift:%d clock_drift_raw:%d delay_penalty:%d "
      "current_delay_sum:" I64u
      "current_delay_samples:%d average_delay_base:%d "
      "last_maxed_out_window:" I64u
      " opt_sndbuf:%d "
      "current_ms:" I64u "",
      actual_delay, our_delay / 1000, their_hist.get_value() / 1000,
      int(off_target / 1000), uint(max_window), uint32(our_hist.delay_base),
      int((our_delay + their_hist.get_value()) / 1000), int(target / 1000),
      uint(bytes_acked), (uint)(cur_window - bytes_acked), (float)(scaled_gain),
      rtt,
      (uint)(max_window * 1000
             / (rtt_hist.delay_base ? rtt_hist.delay_base : 50)),
      (uint)max_window_user, rto, (int)(rto_timeout - ctx->current_ms),
      utp_call_get_microseconds(this->ctx, this), cur_window_packets,
      (uint)get_packet_size(), their_hist.delay_base,
      their_hist.delay_base + their_hist.get_value(), average_delay,
      clock_drift, clock_drift_raw, penalty / 1000, current_delay_sum,
      current_delay_samples, average_delay_base, uint64(last_maxed_out_window),
      int(opt_sndbuf), uint64(ctx->current_ms));
}

static void
utp_register_recv_packet(UTPSocket *conn, size_t len)
{
#ifdef _DEBUG
  ++conn->_stats.nrecv;
  conn->_stats.nbytes_recv += len;
#endif

  if(len <= PACKET_SIZE_MID)
  {
    if(len <= PACKET_SIZE_EMPTY)
    {
      conn->ctx->context_stats._nraw_recv[PACKET_SIZE_EMPTY_BUCKET]++;
    }
    else if(len <= PACKET_SIZE_SMALL)
    {
      conn->ctx->context_stats._nraw_recv[PACKET_SIZE_SMALL_BUCKET]++;
    }
    else
      conn->ctx->context_stats._nraw_recv[PACKET_SIZE_MID_BUCKET]++;
  }
  else
  {
    if(len <= PACKET_SIZE_BIG)
    {
      conn->ctx->context_stats._nraw_recv[PACKET_SIZE_BIG_BUCKET]++;
    }
    else
      conn->ctx->context_stats._nraw_recv[PACKET_SIZE_HUGE_BUCKET]++;
  }
}

// returns the max number of bytes of payload the uTP
// connection is allowed to send
size_t
UTPSocket::get_packet_size() const
{
  int header_size = sizeof(PacketFormatV1);
  size_t mtu      = mtu_last ? mtu_last : mtu_ceiling;
  return mtu - header_size;
}

// Process an incoming packet
// syn is true if this is the first packet received. It will cut off parsing
// as soon as the header is done
size_t
utp_process_incoming(UTPSocket *conn, const byte *packet, size_t len,
                     bool syn = false)
{
  utp_register_recv_packet(conn, len);

  conn->ctx->current_ms = utp_call_get_milliseconds(conn->ctx, conn);

  const PacketFormatV1 *pf1 = (PacketFormatV1 *)packet;
  const byte *packet_end    = packet + len;

  uint16 pk_seq_nr = pf1->seq_nr;
  uint16 pk_ack_nr = pf1->ack_nr;
  uint8 pk_flags   = pf1->type();

  if(pk_flags >= ST_NUM_STATES)
    return 0;

#if UTP_DEBUG_LOGGING
  conn->log(UTP_LOG_DEBUG,
            "Got %s. seq_nr:%u ack_nr:%u state:%s timestamp:" I64u
            " reply_micro:%u",
            flagnames[pk_flags], pk_seq_nr, pk_ack_nr, statenames[conn->state],
            uint64(pf1->tv_usec), (uint32)(pf1->reply_micro));
#endif

  // mark receipt time
  uint64 time = utp_call_get_microseconds(conn->ctx, conn);

  // window packets size is used to calculate a minimum
  // permissible range for received acks. connections with acks falling
  // out of this range are dropped
  const uint16 curr_window = max< uint16 >(
      conn->cur_window_packets + ACK_NR_ALLOWED_WINDOW, ACK_NR_ALLOWED_WINDOW);

  // ignore packets whose ack_nr is invalid. This would imply a spoofed address
  // or a malicious attempt to attach the uTP implementation.
  // acking a packet that hasn't been sent yet!
  // SYN packets have an exception, since there are no previous packets
  if((pk_flags != ST_SYN || conn->state != CS_SYN_RECV)
     && (wrapping_compare_less(conn->seq_nr - 1, pk_ack_nr, ACK_NR_MASK)
         || wrapping_compare_less(pk_ack_nr, conn->seq_nr - 1 - curr_window,
                                  ACK_NR_MASK)))
  {
#if UTP_DEBUG_LOGGING
    conn->log(UTP_LOG_DEBUG,
              "Invalid ack_nr: %u. our seq_nr: %u last unacked: %u", pk_ack_nr,
              conn->seq_nr,
              (conn->seq_nr - conn->cur_window_packets) & ACK_NR_MASK);
#endif
    return 0;
  }

  // RSTs are handled earlier, since the connid matches the send id not the recv
  // id
  assert(pk_flags != ST_RESET);

  // TODO: maybe send a ST_RESET if we're in CS_RESET?

  const byte *selack_ptr = NULL;

  // Unpack UTP packet options
  // Data pointer
  const byte *data = (const byte *)pf1 + conn->get_header_size();
  if(conn->get_header_size() > len)
  {
#if UTP_DEBUG_LOGGING
    conn->log(UTP_LOG_DEBUG, "Invalid packet size (less than header size)");
#endif

    return 0;
  }
  // Skip the extension headers
  uint extension = pf1->ext;
  if(extension != 0)
  {
    do
    {
      // Verify that the packet is valid.
      data += 2;

      if((int)(packet_end - data) < 0 || (int)(packet_end - data) < data[-1])
      {
#if UTP_DEBUG_LOGGING
        conn->log(UTP_LOG_DEBUG, "Invalid len of extensions");
#endif

        return 0;
      }

      switch(extension)
      {
        case 1:  // Selective Acknowledgment
          selack_ptr = data;
          break;
        case 2:  // extension bits
          if(data[-1] != 8)
          {
#if UTP_DEBUG_LOGGING
            conn->log(UTP_LOG_DEBUG, "Invalid len of extension bits header");
#endif

            return 0;
          }
          memcpy(conn->extensions, data, 8);

#if UTP_DEBUG_LOGGING
          conn->log(UTP_LOG_DEBUG,
                    "got extension bits:%02x%02x%02x%02x%02x%02x%02x%02x",
                    conn->extensions[0], conn->extensions[1],
                    conn->extensions[2], conn->extensions[3],
                    conn->extensions[4], conn->extensions[5],
                    conn->extensions[6], conn->extensions[7]);
#endif
      }
      extension = data[-2];
      data += data[-1];
    } while(extension);
  }

  if(conn->state == CS_SYN_SENT)
  {
    // if this is a syn-ack, initialize our ack_nr
    // to match the sequence number we got from
    // the other end
    conn->ack_nr = (pk_seq_nr - 1) & SEQ_NR_MASK;
  }

  conn->last_got_packet = conn->ctx->current_ms;

  if(syn)
  {
    return 0;
  }

  // seqnr is the number of packets past the expected
  // packet this is. ack_nr is the last acked, seq_nr is the
  // current. Subtracring 1 makes 0 mean "this is the next
  // expected packet".
  const uint seqnr = (pk_seq_nr - conn->ack_nr - 1) & SEQ_NR_MASK;

  // Getting an invalid sequence number?
  if(seqnr >= REORDER_BUFFER_MAX_SIZE)
  {
    if(seqnr >= (SEQ_NR_MASK + 1) - REORDER_BUFFER_MAX_SIZE
       && pk_flags != ST_STATE)
    {
      conn->schedule_ack();
    }

#if UTP_DEBUG_LOGGING
    conn->log(UTP_LOG_DEBUG, "    Got old Packet/Ack (%u/%u)=%u", pk_seq_nr,
              conn->ack_nr, seqnr);
#endif
    return 0;
  }

  // Process acknowledgment
  // acks is the number of packets that was acked
  int acks =
      (pk_ack_nr - (conn->seq_nr - 1 - conn->cur_window_packets)) & ACK_NR_MASK;

  // this happens when we receive an old ack nr
  if(acks > conn->cur_window_packets)
    acks = 0;

  // if we get the same ack_nr as in the last packet
  // increase the duplicate_ack counter, otherwise reset
  // it to 0.
  // It's important to only count ACKs in ST_STATE packets. Any other
  // packet (primarily ST_DATA) is likely to have been sent because of the
  // other end having new outgoing data, not in response to incoming data.
  // For instance, if we're receiving a steady stream of payload with no
  // outgoing data, and we suddently have a few bytes of payload to send (say,
  // a bittorrent HAVE message), we're very likely to see 3 duplicate ACKs
  // immediately after sending our payload packet. This effectively disables
  // the fast-resend on duplicate-ack logic for bi-directional connections
  // (except in the case of a selective ACK). This is in line with BSD4.4 TCP
  // implementation.
  if(conn->cur_window_packets > 0)
  {
    if(pk_ack_nr
           == ((conn->seq_nr - conn->cur_window_packets - 1) & ACK_NR_MASK)
       && conn->cur_window_packets > 0 && pk_flags == ST_STATE)
    {
      ++conn->duplicate_ack;
      if(conn->duplicate_ack == DUPLICATE_ACKS_BEFORE_RESEND
         && conn->mtu_probe_seq)
      {
        // It's likely that the probe was rejected due to its size, but we
        // haven't got an ICMP report back yet
        if(pk_ack_nr == ((conn->mtu_probe_seq - 1) & ACK_NR_MASK))
        {
          conn->mtu_ceiling = conn->mtu_probe_size - 1;
          conn->mtu_search_update();
          conn->log(UTP_LOG_MTU, "MTU [DUPACK] floor:%d ceiling:%d current:%d",
                    conn->mtu_floor, conn->mtu_ceiling, conn->mtu_last);
        }
        else
        {
          // A non-probe was blocked before our probe.
          // Can't conclude much, send a new probe
          conn->mtu_probe_seq = conn->mtu_probe_size = 0;
        }
      }
    }
    else
    {
      conn->duplicate_ack = 0;
    }

    // TODO: if duplicate_ack == DUPLICATE_ACK_BEFORE_RESEND
    // and fast_resend_seq_nr <= ack_nr + 1
    //    resend ack_nr + 1
    // also call maybe_decay_win()
  }

  // figure out how many bytes were acked
  size_t acked_bytes = 0;

  // the minimum rtt of all acks
  // this is the upper limit on the delay we get back
  // from the other peer. Our delay cannot exceed
  // the rtt of the packet. If it does, clamp it.
  // this is done in apply_ledbat_ccontrol()
  int64 min_rtt = INT64_MAX;

  uint64 now = utp_call_get_microseconds(conn->ctx, conn);

  for(int i = 0; i < acks; ++i)
  {
    int seq = (conn->seq_nr - conn->cur_window_packets + i) & ACK_NR_MASK;
    OutgoingPacket *pkt = (OutgoingPacket *)conn->outbuf.get(seq);
    if(pkt == 0 || pkt->transmissions == 0)
      continue;
    assert((int)(pkt->payload) >= 0);
    acked_bytes += pkt->payload;
    if(conn->mtu_probe_seq && seq == static_cast< int >(conn->mtu_probe_seq))
    {
      conn->mtu_floor = conn->mtu_probe_size;
      conn->mtu_search_update();
      conn->log(UTP_LOG_MTU, "MTU [ACK] floor:%d ceiling:%d current:%d",
                conn->mtu_floor, conn->mtu_ceiling, conn->mtu_last);
    }

    // in case our clock is not monotonic
    if(pkt->time_sent < now)
      min_rtt = min< int64 >(min_rtt, now - pkt->time_sent);
    else
      min_rtt = min< int64 >(min_rtt, 50000);
  }

  // count bytes acked by EACK
  if(selack_ptr != NULL)
  {
    acked_bytes += conn->selective_ack_bytes(
        (pk_ack_nr + 2) & ACK_NR_MASK, selack_ptr, selack_ptr[-1], min_rtt);
  }

#if UTP_DEBUG_LOGGING
  conn->log(UTP_LOG_DEBUG,
            "acks:%d acked_bytes:%u seq_nr:%d cur_window:%u "
            "cur_window_packets:%u relative_seqnr:%u max_window:%u min_rtt:%u "
            "rtt:%u",
            acks, (uint)acked_bytes, conn->seq_nr, (uint)conn->cur_window,
            conn->cur_window_packets, seqnr, (uint)conn->max_window,
            (uint)(min_rtt / 1000), conn->rtt);
#endif

  uint64 p = pf1->tv_usec;

  conn->last_measured_delay = conn->ctx->current_ms;

  // get delay in both directions
  // record the delay to report back
  const uint32 their_delay = (uint32)(p == 0 ? 0 : time - p);
  conn->reply_micro        = their_delay;
  uint32 prev_delay_base   = conn->their_hist.delay_base;
  if(their_delay != 0)
    conn->their_hist.add_sample(their_delay, conn->ctx->current_ms);

  // if their new delay base is less than their previous one
  // we should shift our delay base in the other direction in order
  // to take the clock skew into account
  if(prev_delay_base != 0
     && wrapping_compare_less(conn->their_hist.delay_base, prev_delay_base,
                              TIMESTAMP_MASK))
  {
    // never adjust more than 10 milliseconds
    if(prev_delay_base - conn->their_hist.delay_base <= 10000)
    {
      conn->our_hist.shift(prev_delay_base - conn->their_hist.delay_base);
    }
  }

  const uint32 actual_delay =
      (uint32(pf1->reply_micro) == INT_MAX ? 0 : uint32(pf1->reply_micro));

  // if the actual delay is 0, it means the other end
  // hasn't received a sample from us yet, and doesn't
  // know what it is. We can't update out history unless
  // we have a true measured sample
  if(actual_delay != 0)
  {
    conn->our_hist.add_sample(actual_delay, conn->ctx->current_ms);

    // this is keeping an average of the delay samples
    // we've recevied within the last 5 seconds. We sum
    // all the samples and increase the count in order to
    // calculate the average every 5 seconds. The samples
    // are based off of the average_delay_base to deal with
    // wrapping counters.
    if(conn->average_delay_base == 0)
      conn->average_delay_base = actual_delay;
    int64 average_delay_sample = 0;
    // distance walking from lhs to rhs, downwards
    const uint32 dist_down = conn->average_delay_base - actual_delay;
    // distance walking from lhs to rhs, upwards
    const uint32 dist_up = actual_delay - conn->average_delay_base;

    if(dist_down > dist_up)
    {
      //			assert(dist_up < INT_MAX / 4);
      // average_delay_base < actual_delay, we should end up
      // with a positive sample
      average_delay_sample = dist_up;
    }
    else
    {
      //			assert(-int64(dist_down) < INT_MAX / 4);
      // average_delay_base >= actual_delay, we should end up
      // with a negative sample
      average_delay_sample = -int64(dist_down);
    }
    conn->current_delay_sum += average_delay_sample;
    ++conn->current_delay_samples;

    if(conn->ctx->current_ms > conn->average_sample_time)
    {
      int32 prev_average_delay = conn->average_delay;

      assert(conn->current_delay_sum / conn->current_delay_samples < INT_MAX);
      assert(conn->current_delay_sum / conn->current_delay_samples > -INT_MAX);
      // write the new average
      conn->average_delay =
          (int32)(conn->current_delay_sum / conn->current_delay_samples);
      // each slot represents 5 seconds
      conn->average_sample_time += 5000;

      conn->current_delay_sum     = 0;
      conn->current_delay_samples = 0;

      // this makes things very confusing when logging the average delay
      //#if !g_log_utp
      // normalize the average samples
      // since we're only interested in the slope
      // of the curve formed by the average delay samples,
      // we can cancel out the actual offset to make sure
      // we won't have problems with wrapping.
      int min_sample = min(prev_average_delay, conn->average_delay);
      int max_sample = max(prev_average_delay, conn->average_delay);

      // normalize around zero. Try to keep the min <= 0 and max >= 0
      int adjust = 0;
      if(min_sample > 0)
      {
        // adjust all samples (and the baseline) down by min_sample
        adjust = -min_sample;
      }
      else if(max_sample < 0)
      {
        // adjust all samples (and the baseline) up by -max_sample
        adjust = -max_sample;
      }
      if(adjust)
      {
        conn->average_delay_base -= adjust;
        conn->average_delay += adjust;
        prev_average_delay += adjust;
      }
      //#endif

      // update the clock drift estimate
      // the unit is microseconds per 5 seconds
      // what we're doing is just calculating the average of the
      // difference between each slot. Since each slot is 5 seconds
      // and the timestamps unit are microseconds, we'll end up with
      // the average slope across our history. If there is a consistent
      // trend, it will show up in this value

      // int64 slope = 0;
      int32 drift = conn->average_delay - prev_average_delay;

      // clock_drift is a rolling average
      conn->clock_drift     = (int64(conn->clock_drift) * 7 + drift) / 8;
      conn->clock_drift_raw = drift;
    }
  }

  // if our new delay base is less than our previous one
  // we should shift the other end's delay base in the other
  // direction in order to take the clock skew into account
  // This is commented out because it creates bad interactions
  // with our adjustment in the other direction. We don't really
  // need our estimates of the other peer to be very accurate
  // anyway. The problem with shifting here is that we're more
  // likely shift it back later because of a low latency. This
  // second shift back would cause us to shift our delay base
  // which then get's into a death spiral of shifting delay bases
  /*	if (prev_delay_base != 0 &&
                  wrapping_compare_less(conn->our_hist.delay_base,
     prev_delay_base)) {
                  // never adjust more than 10 milliseconds
                  if (prev_delay_base - conn->our_hist.delay_base <= 10000) {
                          conn->their_hist.Shift(prev_delay_base -
     conn->our_hist.delay_base);
                  }
          }
  */

  // if the delay estimate exceeds the RTT, adjust the base_delay to
  // compensate
  assert(min_rtt >= 0);
  if(int64(conn->our_hist.get_value()) > min_rtt)
  {
    conn->our_hist.shift((uint32)(conn->our_hist.get_value() - min_rtt));
  }

  // only apply the congestion controller on acks
  // if we don't have a delay measurement, there's
  // no point in invoking the congestion control
  if(actual_delay != 0 && acked_bytes >= 1)
    conn->apply_ccontrol(acked_bytes, actual_delay, min_rtt);

  // sanity check, the other end should never ack packets
  // past the point we've sent
  if(acks <= conn->cur_window_packets)
  {
    conn->max_window_user = pf1->windowsize;

    // If max user window is set to 0, then we startup a timer
    // That will reset it to 1 after 15 seconds.
    if(conn->max_window_user == 0)
      // Reset max_window_user to 1 every 15 seconds.
      conn->zerowindow_time = conn->ctx->current_ms + 15000;

    // Respond to connect message
    // Switch to CONNECTED state.
    // If this is an ack and we're in still handshaking
    // transition over to the connected state.

    // Incoming connection completion
    if(pk_flags == ST_DATA && conn->state == CS_SYN_RECV)
    {
      conn->state = CS_CONNECTED;
    }

    // Outgoing connection completion
    if(pk_flags == ST_STATE && conn->state == CS_SYN_SENT)
    {
      conn->state = CS_CONNECTED;

      // If the user has defined the ON_CONNECT callback, use that to
      // notify the user that the socket is now connected.  If ON_CONNECT
      // has not been defined, notify the user via ON_STATE_CHANGE.
      if(conn->ctx->callbacks[UTP_ON_CONNECT])
        utp_call_on_connect(conn->ctx, conn);
      else
        utp_call_on_state_change(conn->ctx, conn, UTP_STATE_CONNECT);

      // We've sent a fin, and everything was ACKed (including the FIN).
      // cur_window_packets == acks means that this packet acked all
      // the remaining packets that were in-flight.
    }
    else if(conn->fin_sent && conn->cur_window_packets == acks)
    {
      conn->fin_sent_acked = true;
      if(conn->close_requested)
      {
        conn->state = CS_DESTROY;
      }
    }

    // Update fast resend counter
    if(wrapping_compare_less(conn->fast_resend_seq_nr,
                             (pk_ack_nr + 1) & ACK_NR_MASK, ACK_NR_MASK))
      conn->fast_resend_seq_nr = (pk_ack_nr + 1) & ACK_NR_MASK;

#if UTP_DEBUG_LOGGING
    conn->log(UTP_LOG_DEBUG, "fast_resend_seq_nr:%u", conn->fast_resend_seq_nr);
#endif

    for(int i = 0; i < acks; ++i)
    {
      int ack_status =
          conn->ack_packet(conn->seq_nr - conn->cur_window_packets);
      // if ack_status is 0, the packet was acked.
      // if acl_stauts is 1, it means that the packet had already been acked
      // if it's 2, the packet has not been sent yet
      // We need to break this loop in the latter case. This could potentially
      // happen if we get an ack_nr that does not exceed what we have stuffed
      // into the outgoing buffer, but does exceed what we have sent
      if(ack_status == 2)
      {
#ifdef _DEBUG
        OutgoingPacket *pkt = (OutgoingPacket *)conn->outbuf.get(
            conn->seq_nr - conn->cur_window_packets);
        assert(pkt->transmissions == 0);
#endif

        break;
      }
      conn->cur_window_packets--;

#if UTP_DEBUG_LOGGING
      conn->log(UTP_LOG_DEBUG, "decementing cur_window_packets:%u",
                conn->cur_window_packets);
#endif
    }

#ifdef _DEBUG
    if(conn->cur_window_packets == 0)
      assert(conn->cur_window == 0);
#endif

    // packets in front of this may have been acked by a
    // selective ack (EACK). Keep decreasing the window packet size
    // until we hit a packet that is still waiting to be acked
    // in the send queue
    // this is especially likely to happen when the other end
    // has the EACK send bug older versions of uTP had
    while(conn->cur_window_packets > 0
          && !conn->outbuf.get(conn->seq_nr - conn->cur_window_packets))
    {
      conn->cur_window_packets--;

#if UTP_DEBUG_LOGGING
      conn->log(UTP_LOG_DEBUG, "decementing cur_window_packets:%u",
                conn->cur_window_packets);
#endif
    }

#ifdef _DEBUG
    if(conn->cur_window_packets == 0)
      assert(conn->cur_window == 0);
#endif

    // this invariant should always be true
    assert(conn->cur_window_packets == 0
           || conn->outbuf.get(conn->seq_nr - conn->cur_window_packets));

    // flush Nagle
    if(conn->cur_window_packets == 1)
    {
      OutgoingPacket *pkt =
          (OutgoingPacket *)conn->outbuf.get(conn->seq_nr - 1);
      // do we still have quota?
      if(pkt->transmissions == 0)
      {
        conn->send_packet(pkt);
      }
    }

    // Fast timeout-retry
    if(conn->fast_timeout)
    {
#if UTP_DEBUG_LOGGING
      conn->log(UTP_LOG_DEBUG, "Fast timeout %u,%u,%u?", (uint)conn->cur_window,
                conn->seq_nr - conn->timeout_seq_nr, conn->timeout_seq_nr);
#endif

      // if the fast_resend_seq_nr is not pointing to the oldest outstanding
      // packet, it suggests that we've already resent the packet that timed
      // out, and we should leave the fast-timeout mode.
      if(((conn->seq_nr - conn->cur_window_packets) & ACK_NR_MASK)
         != conn->fast_resend_seq_nr)
      {
        conn->fast_timeout = false;
      }
      else
      {
        // resend the oldest packet and increment fast_resend_seq_nr
        // to not allow another fast resend on it again
        OutgoingPacket *pkt = (OutgoingPacket *)conn->outbuf.get(
            conn->seq_nr - conn->cur_window_packets);
        if(pkt && pkt->transmissions > 0)
        {
#if UTP_DEBUG_LOGGING
          conn->log(UTP_LOG_DEBUG, "Packet %u fast timeout-retry.",
                    conn->seq_nr - conn->cur_window_packets);
#endif

#ifdef _DEBUG
          ++conn->_stats.fastrexmit;
#endif

          conn->fast_resend_seq_nr++;
          conn->send_packet(pkt);
        }
      }
    }
  }

  // Process selective acknowledgent
  if(selack_ptr != NULL)
  {
    conn->selective_ack(pk_ack_nr + 2, selack_ptr, selack_ptr[-1]);
  }

  // this invariant should always be true
  assert(conn->cur_window_packets == 0
         || conn->outbuf.get(conn->seq_nr - conn->cur_window_packets));

#if UTP_DEBUG_LOGGING
  conn->log(UTP_LOG_DEBUG,
            "acks:%d acked_bytes:%u seq_nr:%u cur_window:%u "
            "cur_window_packets:%u ",
            acks, (uint)acked_bytes, conn->seq_nr, (uint)conn->cur_window,
            conn->cur_window_packets);
#endif

  // In case the ack dropped the current window below
  // the max_window size, Mark the socket as writable
  if(conn->state == CS_CONNECTED_FULL && !conn->is_full())
  {
    conn->state = CS_CONNECTED;
#if UTP_DEBUG_LOGGING
    conn->log(UTP_LOG_DEBUG,
              "Socket writable. max_window:%u cur_window:%u packet_size:%u",
              (uint)conn->max_window, (uint)conn->cur_window,
              (uint)conn->get_packet_size());
#endif
    utp_call_on_state_change(conn->ctx, conn, UTP_STATE_WRITABLE);
  }

  if(pk_flags == ST_STATE)
  {
    // This is a state packet only.
    return 0;
  }

  // The connection is not in a state that can accept data?
  if(conn->state != CS_CONNECTED && conn->state != CS_CONNECTED_FULL)
  {
    return 0;
  }

  // Is this a finalize packet?
  if(pk_flags == ST_FIN && !conn->got_fin)
  {
#if UTP_DEBUG_LOGGING
    conn->log(UTP_LOG_DEBUG, "Got FIN eof_pkt:%u", pk_seq_nr);
#endif

    conn->got_fin = true;
    conn->eof_pkt = pk_seq_nr;
    // at this point, it is possible for the
    // other end to have sent packets with
    // sequence numbers higher than seq_nr.
    // if this is the case, our reorder_count
    // is out of sync. This case is dealt with
    // when we re-order and hit the eof_pkt.
    // we'll just ignore any packets with
    // sequence numbers past this
  }

  // Getting an in-order packet?
  if(seqnr == 0)
  {
    size_t count = packet_end - data;
    if(count > 0 && !conn->read_shutdown)
    {
#if UTP_DEBUG_LOGGING
      conn->log(UTP_LOG_DEBUG, "Got Data len:%u (rb:%u)", (uint)count,
                (uint)utp_call_get_read_buffer_size(conn->ctx, conn));
#endif

      // Post bytes to the upper layer
      utp_call_on_read(conn->ctx, conn, data, count);
    }
    conn->ack_nr++;

    // Check if the next packet has been received too, but waiting
    // in the reorder buffer.
    for(;;)
    {
      if(!conn->got_fin_reached && conn->got_fin
         && conn->eof_pkt == conn->ack_nr)
      {
        conn->got_fin_reached = true;
        conn->rto_timeout =
            conn->ctx->current_ms + min< uint >(conn->rto * 3, 60);

#if UTP_DEBUG_LOGGING
        conn->log(UTP_LOG_DEBUG, "Posting EOF");
#endif

        utp_call_on_state_change(conn->ctx, conn, UTP_STATE_EOF);

        // if the other end wants to close, ack
        conn->send_ack();

        // reorder_count is not necessarily 0 at this point.
        // even though it is most of the time, the other end
        // may have sent packets with higher sequence numbers
        // than what later end up being eof_pkt
        // since we have received all packets up to eof_pkt
        // just ignore the ones after it.
        conn->reorder_count = 0;
      }

      // Quick get-out in case there is nothing to reorder
      if(conn->reorder_count == 0)
        break;

      // Check if there are additional buffers in the reorder buffers
      // that need delivery.
      byte *p = (byte *)conn->inbuf.get(conn->ack_nr + 1);
      if(p == NULL)
        break;
      conn->inbuf.put(conn->ack_nr + 1, NULL);
      count = *(uint *)p;
      if(count > 0 && !conn->read_shutdown)
      {
        // Pass the bytes to the upper layer
        utp_call_on_read(conn->ctx, conn, p + sizeof(uint), count);
      }
      conn->ack_nr++;

      // Free the element from the reorder buffer
      free(p);
      assert(conn->reorder_count > 0);
      conn->reorder_count--;
    }

    conn->schedule_ack();
  }
  else
  {
    // Getting an out of order packet.
    // The packet needs to be remembered and rearranged later.

    // if we have received a FIN packet, and the EOF-sequence number
    // is lower than the sequence number of the packet we just received
    // something is wrong.
    if(conn->got_fin && pk_seq_nr > conn->eof_pkt)
    {
#if UTP_DEBUG_LOGGING
      conn->log(UTP_LOG_DEBUG,
                "Got an invalid packet sequence number, past EOF "
                "reorder_count:%u len:%u (rb:%u)",
                conn->reorder_count, (uint)(packet_end - data),
                (uint)utp_call_get_read_buffer_size(conn->ctx, conn));
#endif
      return 0;
    }

    // if the sequence number is entirely off the expected
    // one, just drop it. We can't allocate buffer space in
    // the inbuf entirely based on untrusted input
    if(seqnr > 0x3ff)
    {
#if UTP_DEBUG_LOGGING
      conn->log(UTP_LOG_DEBUG,
                "0x%08x: Got an invalid packet sequence number, too far off "
                "reorder_count:%u len:%u (rb:%u)",
                conn->reorder_count, (uint)(packet_end - data),
                (uint)utp_call_get_read_buffer_size(conn->ctx, conn));
#endif
      return 0;
    }

    // we need to grow the circle buffer before we
    // check if the packet is already in here, so that
    // we don't end up looking at an older packet (since
    // the indices wraps around).
    conn->inbuf.ensure_size(pk_seq_nr + 1, seqnr + 1);

    // Has this packet already been received? (i.e. a duplicate)
    // If that is the case, just discard it.
    if(conn->inbuf.get(pk_seq_nr) != NULL)
    {
#ifdef _DEBUG
      ++conn->_stats.nduprecv;
#endif

      return 0;
    }

    // Allocate memory to fit the packet that needs to re-ordered
    byte *mem    = (byte *)malloc((packet_end - data) + sizeof(uint));
    *(uint *)mem = (uint)(packet_end - data);
    memcpy(mem + sizeof(uint), data, packet_end - data);

    // Insert into reorder buffer and increment the count
    // of # of packets to be reordered.
    // we add one to seqnr in order to leave the last
    // entry empty, that way the assert in send_ack
    // is valid. we have to add one to seqnr too, in order
    // to make the circular buffer grow around the correct
    // point (which is conn->ack_nr + 1).
    assert(conn->inbuf.get(pk_seq_nr) == NULL);
    assert((pk_seq_nr & conn->inbuf.mask)
           != ((conn->ack_nr + 1) & conn->inbuf.mask));
    conn->inbuf.put(pk_seq_nr, mem);
    conn->reorder_count++;

#if UTP_DEBUG_LOGGING
    conn->log(UTP_LOG_DEBUG,
              "0x%08x: Got out of order data reorder_count:%u len:%u (rb:%u)",
              conn->reorder_count, (uint)(packet_end - data),
              (uint)utp_call_get_read_buffer_size(conn->ctx, conn));
#endif

    conn->schedule_ack();
  }

  return (size_t)(packet_end - data);
}

inline byte
UTP_Version(PacketFormatV1 const *pf)
{
  return (pf->type() < ST_NUM_STATES && pf->ext < 3 ? pf->version() : 0);
}

UTPSocket::~UTPSocket()
{
#if UTP_DEBUG_LOGGING
  log(UTP_LOG_DEBUG, "Killing socket");
#endif

  utp_call_on_state_change(ctx, this, UTP_STATE_DESTROYING);

  if(ctx->last_utp_socket == this)
  {
    ctx->last_utp_socket = NULL;
  }

  // Remove object from the global hash table
  UTPSocketKeyData *kd =
      ctx->utp_sockets->Delete(UTPSocketKey(addr, conn_id_recv));
  assert(kd);
  (void)kd;
  // remove the socket from ack_sockets if it was there also
  removeSocketFromAckList(this);

  // Free all memory occupied by the socket object.
  for(size_t i = 0; i <= inbuf.mask; i++)
  {
    free(inbuf.elements[i]);
  }
  for(size_t i = 0; i <= outbuf.mask; i++)
  {
    free(outbuf.elements[i]);
  }
  // TODO: The circular buffer should have a destructor
  free(inbuf.elements);
  free(outbuf.elements);
}

void
UTP_FreeAll(struct UTPSocketHT *utp_sockets)
{
  utp_hash_iterator_t it;
  UTPSocketKeyData *keyData;
  while((keyData = utp_sockets->Iterate(it)))
  {
    delete keyData->socket;
  }
}

void
utp_initialize_socket(utp_socket *conn, const struct sockaddr *addr,
                      socklen_t addrlen, bool need_seed_gen, uint32 conn_seed,
                      uint32 conn_id_recv, uint32 conn_id_send)
{
  PackedSockAddr psaddr =
      PackedSockAddr((const SOCKADDR_STORAGE *)addr, addrlen);

  if(need_seed_gen)
  {
    do
    {
      conn_seed = utp_call_get_random(conn->ctx, conn);
      // we identify v1 and higher by setting the first two bytes to 0x0001
      conn_seed &= 0xffff;
    } while(conn->ctx->utp_sockets->Lookup(UTPSocketKey(psaddr, conn_seed)));

    conn_id_recv += conn_seed;
    conn_id_send += conn_seed;
  }

  conn->state               = CS_IDLE;
  conn->conn_seed           = conn_seed;
  conn->conn_id_recv        = conn_id_recv;
  conn->conn_id_send        = conn_id_send;
  conn->addr                = psaddr;
  conn->ctx->current_ms     = utp_call_get_milliseconds(conn->ctx, NULL);
  conn->last_got_packet     = conn->ctx->current_ms;
  conn->last_sent_packet    = conn->ctx->current_ms;
  conn->last_measured_delay = conn->ctx->current_ms + 0x70000000;
  conn->average_sample_time = conn->ctx->current_ms + 5000;
  conn->last_rwin_decay     = conn->ctx->current_ms - MAX_WINDOW_DECAY;

  conn->our_hist.clear(conn->ctx->current_ms);
  conn->their_hist.clear(conn->ctx->current_ms);
  conn->rtt_hist.clear(conn->ctx->current_ms);

  // initialize MTU floor and ceiling
  conn->mtu_reset();
  conn->mtu_last = conn->mtu_ceiling;

  conn->ctx->utp_sockets->Add(UTPSocketKey(conn->addr, conn->conn_id_recv))
      ->socket = conn;

  // we need to fit one packet in the window when we start the connection
  conn->max_window = conn->get_packet_size();

#if UTP_DEBUG_LOGGING
  conn->log(UTP_LOG_DEBUG, "UTP socket initialized");
#endif
}

utp_socket *
utp_create_socket(utp_context *ctx)
{
  assert(ctx);
  if(!ctx)
    return NULL;

  UTPSocket *conn = new UTPSocket;  // TODO: UTPSocket should have a constructor

  conn->state                 = CS_UNINITIALIZED;
  conn->ctx                   = ctx;
  conn->userdata              = NULL;
  conn->reorder_count         = 0;
  conn->duplicate_ack         = 0;
  conn->timeout_seq_nr        = 0;
  conn->last_rcv_win          = 0;
  conn->got_fin               = false;
  conn->got_fin_reached       = false;
  conn->fin_sent              = false;
  conn->fin_sent_acked        = false;
  conn->read_shutdown         = false;
  conn->close_requested       = false;
  conn->fast_timeout          = false;
  conn->rtt                   = 0;
  conn->retransmit_timeout    = 0;
  conn->rto_timeout           = 0;
  conn->zerowindow_time       = 0;
  conn->average_delay         = 0;
  conn->current_delay_samples = 0;
  conn->cur_window            = 0;
  conn->eof_pkt               = 0;
  conn->last_maxed_out_window = 0;
  conn->mtu_probe_seq         = 0;
  conn->mtu_probe_size        = 0;
  conn->current_delay_sum     = 0;
  conn->average_delay_base    = 0;
  conn->retransmit_count      = 0;
  conn->rto                   = 3000;
  conn->rtt_var               = 800;
  conn->seq_nr                = 1;
  conn->ack_nr                = 0;
  conn->max_window_user       = 255 * PACKET_SIZE;
  conn->cur_window_packets    = 0;
  conn->fast_resend_seq_nr    = conn->seq_nr;
  conn->target_delay          = ctx->target_delay;
  conn->reply_micro           = 0;
  conn->opt_sndbuf            = ctx->opt_sndbuf;
  conn->opt_rcvbuf            = ctx->opt_rcvbuf;
  conn->slow_start            = true;
  conn->ssthresh              = conn->opt_sndbuf;
  conn->clock_drift           = 0;
  conn->clock_drift_raw       = 0;
  conn->outbuf.mask           = 15;
  conn->inbuf.mask            = 15;
  conn->outbuf.elements       = (void **)calloc(16, sizeof(void *));
  conn->inbuf.elements        = (void **)calloc(16, sizeof(void *));
  conn->ida = -1;  // set the index of every new socket in ack_sockets to
                   // -1, which also means it is not in ack_sockets yet

  memset(conn->extensions, 0, sizeof(conn->extensions));

#ifdef _DEBUG
  memset(&conn->_stats, 0, sizeof(utp_socket_stats));
#endif

  return conn;
}

int
utp_context_set_option(utp_context *ctx, int opt, int val)
{
  assert(ctx);
  if(!ctx)
    return -1;

  switch(opt)
  {
    case UTP_LOG_NORMAL:
      ctx->log_normal = val ? true : false;
      return 0;

    case UTP_LOG_MTU:
      ctx->log_mtu = val ? true : false;
      return 0;

    case UTP_LOG_DEBUG:
      ctx->log_debug = val ? true : false;
      return 0;

    case UTP_TARGET_DELAY:
      ctx->target_delay = val;
      return 0;

    case UTP_SNDBUF:
      assert(val >= 1);
      ctx->opt_sndbuf = val;
      return 0;

    case UTP_RCVBUF:
      assert(val >= 1);
      ctx->opt_rcvbuf = val;
      return 0;
  }
  return -1;
}

int
utp_context_get_option(utp_context *ctx, int opt)
{
  assert(ctx);
  if(!ctx)
    return -1;

  switch(opt)
  {
    case UTP_LOG_NORMAL:
      return ctx->log_normal ? 1 : 0;
    case UTP_LOG_MTU:
      return ctx->log_mtu ? 1 : 0;
    case UTP_LOG_DEBUG:
      return ctx->log_debug ? 1 : 0;
    case UTP_TARGET_DELAY:
      return ctx->target_delay;
    case UTP_SNDBUF:
      return ctx->opt_sndbuf;
    case UTP_RCVBUF:
      return ctx->opt_rcvbuf;
  }
  return -1;
}

int
utp_setsockopt(UTPSocket *conn, int opt, int val)
{
  assert(conn);
  if(!conn)
    return -1;

  switch(opt)
  {
    case UTP_SNDBUF:
      assert(val >= 1);
      conn->opt_sndbuf = val;
      return 0;

    case UTP_RCVBUF:
      assert(val >= 1);
      conn->opt_rcvbuf = val;
      return 0;

    case UTP_TARGET_DELAY:
      conn->target_delay = val;
      return 0;
  }

  return -1;
}

int
utp_getsockopt(UTPSocket *conn, int opt)
{
  assert(conn);
  if(!conn)
    return -1;

  switch(opt)
  {
    case UTP_SNDBUF:
      return conn->opt_sndbuf;
    case UTP_RCVBUF:
      return conn->opt_rcvbuf;
    case UTP_TARGET_DELAY:
      return conn->target_delay;
  }

  return -1;
}

// Try to connect to a specified host.
int
utp_connect(utp_socket *conn, const struct sockaddr *to, socklen_t tolen)
{
  assert(conn);
  if(!conn)
    return -1;

  assert(conn->state == CS_UNINITIALIZED);
  if(conn->state != CS_UNINITIALIZED)
  {
    conn->state = CS_DESTROY;
    return -1;
  }

  utp_initialize_socket(conn, to, tolen, true, 0, 0, 1);

  assert(conn->cur_window_packets == 0);
  assert(conn->outbuf.get(conn->seq_nr) == NULL);
  assert(sizeof(PacketFormatV1) == 20);

  conn->state           = CS_SYN_SENT;
  conn->ctx->current_ms = utp_call_get_milliseconds(conn->ctx, conn);

  // Create and send a connect message

  // used in parse_log.py
  conn->log(UTP_LOG_NORMAL,
            "UTP_Connect conn_seed:%u packet_size:%u (B) "
            "target_delay:%u (ms) delay_history:%u "
            "delay_base_history:%u (minutes)",
            conn->conn_seed, PACKET_SIZE,
            static_cast< unsigned int >(conn->target_delay / 1000),
            CUR_DELAY_SIZE, DELAY_BASE_HISTORY);

  // Setup initial timeout timer.
  conn->retransmit_timeout = 3000;
  conn->rto_timeout        = conn->ctx->current_ms + conn->retransmit_timeout;
  conn->last_rcv_win       = conn->get_rcv_window();

  // if you need compatibiltiy with 1.8.1, use this. it increases attackability
  // though.
  // conn->seq_nr = 1;
  conn->seq_nr = utp_call_get_random(conn->ctx, conn);

  // Create the connect packet.
  const size_t header_size = sizeof(PacketFormatV1);

  OutgoingPacket *pkt =
      (OutgoingPacket *)malloc(sizeof(OutgoingPacket) - 1 + header_size);
  PacketFormatV1 *p1 = (PacketFormatV1 *)pkt->data;

  memset(p1, 0, header_size);
  // SYN packets are special, and have the receive ID in the connid field,
  // instead of conn_id_send.
  p1->set_version(1);
  p1->set_type(ST_SYN);
  p1->ext            = 0;
  p1->connid         = conn->conn_id_recv;
  p1->windowsize     = (uint32)conn->last_rcv_win;
  p1->seq_nr         = conn->seq_nr;
  pkt->transmissions = 0;
  pkt->length        = header_size;
  pkt->payload       = 0;

  /*
  #if UTP_DEBUG_LOGGING
  conn->log(UTP_LOG_DEBUG, "Sending connect %s [%u].",
                  addrfmt(conn->addr, addrbuf), conn_seed);
  #endif
  */

  // Remember the message in the outgoing queue.
  conn->outbuf.ensure_size(conn->seq_nr, conn->cur_window_packets);
  conn->outbuf.put(conn->seq_nr, pkt);
  conn->seq_nr++;
  conn->cur_window_packets++;

#if UTP_DEBUG_LOGGING
  conn->log(UTP_LOG_DEBUG, "incrementing cur_window_packets:%u",
            conn->cur_window_packets);
#endif

  conn->send_packet(pkt);
  return 0;
}

// Returns 1 if the UDP payload was recognized as a UTP packet, or 0 if it was
// not
int
utp_process_udp(utp_context *ctx, const byte *buffer, size_t len,
                const struct sockaddr *to, socklen_t tolen)
{
  assert(ctx);
  if(!ctx)
    return 0;

  assert(buffer);
  if(!buffer)
    return 0;

  assert(to);
  if(!to)
    return 0;

  const PackedSockAddr addr((const SOCKADDR_STORAGE *)to, tolen);

  if(len < sizeof(PacketFormatV1))
  {
#if UTP_DEBUG_LOGGING
    ctx->log(UTP_LOG_DEBUG, NULL, "recv %s len:%u too small",
             addrfmt(addr, addrbuf), (uint)len);
#endif
    return 0;
  }

  const PacketFormatV1 *pf1 = (PacketFormatV1 *)buffer;
  const byte version        = UTP_Version(pf1);
  const uint32 id           = uint32(pf1->connid);

  if(version != 1)
  {
#if UTP_DEBUG_LOGGING
    ctx->log(UTP_LOG_DEBUG, NULL,
             "recv %s len:%u version:%u unsupported version",
             addrfmt(addr, addrbuf), (uint)len, version);
#endif

    return 0;
  }

#if UTP_DEBUG_LOGGING
  ctx->log(UTP_LOG_DEBUG, NULL, "recv %s len:%u id:%u", addrfmt(addr, addrbuf),
           (uint)len, id);
  ctx->log(UTP_LOG_DEBUG, NULL, "recv id:%u seq_nr:%u ack_nr:%u", id,
           (uint)pf1->seq_nr, (uint)pf1->ack_nr);
#endif

  const byte flags = pf1->type();

  if(flags == ST_RESET)
  {
    // id is either our recv id or our send id
    // if it's our send id, and we initiated the connection, our recv id is id +
    // 1 if it's our send id, and we did not initiate the connection, our recv
    // id is id - 1 we have to check every case

    UTPSocketKeyData *keyData;
    if((keyData = ctx->utp_sockets->Lookup(UTPSocketKey(addr, id)))
       || ((keyData = ctx->utp_sockets->Lookup(UTPSocketKey(addr, id + 1)))
           && keyData->socket->conn_id_send == id)
       || ((keyData = ctx->utp_sockets->Lookup(UTPSocketKey(addr, id - 1)))
           && keyData->socket->conn_id_send == id))
    {
      UTPSocket *conn = keyData->socket;

#if UTP_DEBUG_LOGGING
      ctx->log(UTP_LOG_DEBUG, NULL, "recv RST for existing connection");
#endif

      if(conn->close_requested)
        conn->state = CS_DESTROY;
      else
        conn->state = CS_RESET;

      utp_call_on_overhead_statistics(conn->ctx, conn, false,
                                      len + conn->get_udp_overhead(),
                                      close_overhead);
      const int err =
          (conn->state == CS_SYN_SENT) ? UTP_ECONNREFUSED : UTP_ECONNRESET;
      utp_call_on_error(conn->ctx, conn, err);
    }
    else
    {
#if UTP_DEBUG_LOGGING
      ctx->log(UTP_LOG_DEBUG, NULL, "recv RST for unknown connection");
#endif
    }
    return 1;
  }
  else if(flags != ST_SYN)
  {
    UTPSocket *conn = NULL;

    if(ctx->last_utp_socket && ctx->last_utp_socket->addr == addr
       && ctx->last_utp_socket->conn_id_recv == id)
    {
      conn = ctx->last_utp_socket;
    }
    else
    {
      UTPSocketKeyData *keyData =
          ctx->utp_sockets->Lookup(UTPSocketKey(addr, id));
      if(keyData)
      {
        conn                 = keyData->socket;
        ctx->last_utp_socket = conn;
      }
    }

    if(conn)
    {
#if UTP_DEBUG_LOGGING
      ctx->log(UTP_LOG_DEBUG, NULL, "recv processing");
#endif

      const size_t read = utp_process_incoming(conn, buffer, len);
      utp_call_on_overhead_statistics(conn->ctx, conn, false,
                                      (len - read) + conn->get_udp_overhead(),
                                      header_overhead);
      return 1;
    }
  }

  // We have not found a matching utp_socket, and this isn't a SYN.  Reject it.
  const uint32 seq_nr = pf1->seq_nr;
  if(flags != ST_SYN)
  {
    ctx->current_ms = utp_call_get_milliseconds(ctx, NULL);

    for(size_t i = 0; i < ctx->rst_info.GetCount(); i++)
    {
      if((ctx->rst_info[i].connid == id) && (ctx->rst_info[i].addr == addr)
         && (ctx->rst_info[i].ack_nr == seq_nr))
      {
        ctx->rst_info[i].timestamp = ctx->current_ms;

#if UTP_DEBUG_LOGGING
        ctx->log(UTP_LOG_DEBUG, NULL,
                 "recv not sending RST to non-SYN (stored)");
#endif

        return 1;
      }
    }

    if(ctx->rst_info.GetCount() > RST_INFO_LIMIT)
    {
#if UTP_DEBUG_LOGGING
      ctx->log(UTP_LOG_DEBUG, NULL,
               "recv not sending RST to non-SYN (limit at %u stored)",
               (uint)ctx->rst_info.GetCount());
#endif

      return 1;
    }

#if UTP_DEBUG_LOGGING
    ctx->log(UTP_LOG_DEBUG, NULL, "recv send RST to non-SYN (%u stored)",
             (uint)ctx->rst_info.GetCount());
#endif

    RST_Info &r = ctx->rst_info.Append();
    r.addr      = addr;
    r.connid    = id;
    r.ack_nr    = seq_nr;
    r.timestamp = ctx->current_ms;

    UTPSocket::send_rst(ctx, addr, id, seq_nr, utp_call_get_random(ctx, NULL));
    return 1;
  }

  if(ctx->callbacks[UTP_ON_ACCEPT])
  {
#if UTP_DEBUG_LOGGING
    ctx->log(UTP_LOG_DEBUG, NULL, "Incoming connection from %s",
             addrfmt(addr, addrbuf));
#endif

    UTPSocketKeyData *keyData =
        ctx->utp_sockets->Lookup(UTPSocketKey(addr, id + 1));
    if(keyData)
    {
#if UTP_DEBUG_LOGGING
      ctx->log(UTP_LOG_DEBUG, NULL,
               "rejected incoming connection, connection already exists");
#endif

      return 1;
    }
    /*
    if(ctx->utp_sockets->GetCount() > 3000)
    {
#if UTP_DEBUG_LOGGING
      ctx->log(UTP_LOG_DEBUG, NULL,
               "rejected incoming connection, too many uTP sockets %d",
               ctx->utp_sockets->GetCount());
#endif

      return 1;
    }
    */
    // true means yes, block connection.  false means no, don't block.
    if(utp_call_on_firewall(ctx, to, tolen))
    {
#if UTP_DEBUG_LOGGING
      ctx->log(UTP_LOG_DEBUG, NULL,
               "rejected incoming connection, firewall callback returned true");
#endif

      return 1;
    }

    // Create a new UTP socket to handle this new connection
    UTPSocket *conn = utp_create_socket(ctx);
    utp_initialize_socket(conn, to, tolen, false, id, id + 1, id);
    conn->ack_nr             = seq_nr;
    conn->seq_nr             = utp_call_get_random(ctx, NULL);
    conn->fast_resend_seq_nr = conn->seq_nr;
    conn->state              = CS_SYN_RECV;

    const size_t read = utp_process_incoming(conn, buffer, len, true);

#if UTP_DEBUG_LOGGING
    ctx->log(UTP_LOG_DEBUG, NULL, "recv send connect ACK");
#endif

    conn->send_ack(true);

    utp_call_on_accept(ctx, conn, to, tolen);

    // we report overhead after on_accept(), because the callbacks are setup now
    utp_call_on_overhead_statistics(conn->ctx, conn, false,
                                    (len - read) + conn->get_udp_overhead(),
                                    header_overhead);  // SYN
    utp_call_on_overhead_statistics(conn->ctx, conn, true, conn->get_overhead(),
                                    ack_overhead);  // SYNACK
  }
  else
  {
#if UTP_DEBUG_LOGGING
    ctx->log(UTP_LOG_DEBUG, NULL,
             "rejected incoming connection, UTP_ON_ACCEPT callback not set");
#endif
  }

  return 1;
}

// Called by utp_process_icmp_fragmentation() and utp_process_icmp_error() below
static UTPSocket *
parse_icmp_payload(utp_context *ctx, const byte *buffer, size_t len,
                   const struct sockaddr *to, socklen_t tolen)
{
  assert(ctx);
  if(!ctx)
    return NULL;

  assert(buffer);
  if(!buffer)
    return NULL;

  assert(to);
  if(!to)
    return NULL;

  const PackedSockAddr addr((const SOCKADDR_STORAGE *)to, tolen);

  // ICMP packets are only required to quote the first 8 bytes of the layer4
  // payload.  The UDP payload is 8 bytes, and the UTP header is another 20
  // bytes.  So, in order to find the entire UTP header, we need the ICMP
  // packet to quote 28 bytes.
  if(len < sizeof(PacketFormatV1))
  {
#if UTP_DEBUG_LOGGING
    ctx->log(UTP_LOG_DEBUG, NULL, "Ignoring ICMP from %s: runt length %d",
             addrfmt(addr, addrbuf), len);
#endif
    return NULL;
  }

  const PacketFormatV1 *pf = (PacketFormatV1 *)buffer;
  const byte version       = UTP_Version(pf);
  const uint32 id          = uint32(pf->connid);

  if(version != 1)
  {
#if UTP_DEBUG_LOGGING
    ctx->log(UTP_LOG_DEBUG, NULL, "Ignoring ICMP from %s: not UTP version 1",
             addrfmt(addr, addrbuf));
#endif
    return NULL;
  }

  UTPSocketKeyData *keyData;

  if((keyData = ctx->utp_sockets->Lookup(UTPSocketKey(addr, id)))
     || ((keyData = ctx->utp_sockets->Lookup(UTPSocketKey(addr, id + 1)))
         && keyData->socket->conn_id_send == id)
     || ((keyData = ctx->utp_sockets->Lookup(UTPSocketKey(addr, id - 1)))
         && keyData->socket->conn_id_send == id))
  {
    return keyData->socket;
  }

#if UTP_DEBUG_LOGGING
  ctx->log(UTP_LOG_DEBUG, NULL,
           "Ignoring ICMP from %s: No matching connection found for id %u",
           addrfmt(addr, addrbuf), id);
#endif
  return NULL;
}

// Should be called when an ICMP Type 3, Code 4 packet (fragmentation needed) is
// received, to adjust the MTU
//
// Returns 1 if the UDP payload (delivered in the ICMP packet) was recognized as
// a UTP packet, or 0 if it was not
//
// @ctx: utp_context
// @buf: Contents of the original UDP payload, which the ICMP packet quoted.
// *Not* the ICMP packet itself.
// @len: buffer length
// @to: destination address of the original UDP pakcet
// @tolen: address length
// @next_hop_mtu:
int
utp_process_icmp_fragmentation(utp_context *ctx, const byte *buffer, size_t len,
                               const struct sockaddr *to, socklen_t tolen,
                               uint16 next_hop_mtu)
{
  UTPSocket *conn = parse_icmp_payload(ctx, buffer, len, to, tolen);
  if(!conn)
    return 0;

  // Constrain the next_hop_mtu to sane values.  It might not be initialized or
  // sent properly
  if(next_hop_mtu >= 576 && next_hop_mtu < 0x2000)
  {
    conn->mtu_ceiling = min< uint32 >(next_hop_mtu, conn->mtu_ceiling);
    conn->mtu_search_update();
    // this is something of a speecial case, where we don't set mtu_last
    // to the value in between the floor and the ceiling. We can update the
    // floor, because there might be more network segments after the one
    // that sent this ICMP with smaller MTUs. But we want to test this
    // MTU size first. If the next probe gets through, mtu_floor is updated
    conn->mtu_last = conn->mtu_ceiling;
  }
  else
  {
    // Otherwise, binary search. At this point we don't actually know
    // what size the packet that failed was, and apparently we can't
    // trust the next hop mtu either. It seems reasonably conservative
    // to just lower the ceiling. This should not happen on working networks
    // anyway.
    conn->mtu_ceiling = (conn->mtu_floor + conn->mtu_ceiling) / 2;
    conn->mtu_search_update();
  }

  conn->log(UTP_LOG_MTU, "MTU [ICMP] floor:%d ceiling:%d current:%d",
            conn->mtu_floor, conn->mtu_ceiling, conn->mtu_last);
  return 1;
}

// Should be called when an ICMP message is received that should tear down the
// connection.
//
// Returns 1 if the UDP payload (delivered in the ICMP packet) was recognized as
// a UTP packet, or 0 if it was not
//
// @ctx: utp_context
// @buf: Contents of the original UDP payload, which the ICMP packet quoted.
// *Not* the ICMP packet itself.
// @len: buffer length
// @to: destination address of the original UDP pakcet
// @tolen: address length
int
utp_process_icmp_error(utp_context *ctx, const byte *buffer, size_t len,
                       const struct sockaddr *to, socklen_t tolen)
{
  UTPSocket *conn = parse_icmp_payload(ctx, buffer, len, to, tolen);
  if(!conn)
    return 0;

  const int err =
      (conn->state == CS_SYN_SENT) ? UTP_ECONNREFUSED : UTP_ECONNRESET;
  const PackedSockAddr addr((const SOCKADDR_STORAGE *)to, tolen);

  switch(conn->state)
  {
    // Don't pass on errors for idle/closed connections
    case CS_IDLE:
#if UTP_DEBUG_LOGGING
      ctx->log(UTP_LOG_DEBUG, NULL, "ICMP from %s in state CS_IDLE, ignoring",
               addrfmt(addr, addrbuf));
#endif
      return 1;

    default:
      if(conn->close_requested)
      {
#if UTP_DEBUG_LOGGING
        ctx->log(UTP_LOG_DEBUG, NULL,
                 "ICMP from %s after close, setting state to CS_DESTROY and "
                 "causing error %d",
                 addrfmt(addr, addrbuf), err);
#endif
        conn->state = CS_DESTROY;
      }
      else
      {
#if UTP_DEBUG_LOGGING
        ctx->log(UTP_LOG_DEBUG, NULL,
                 "ICMP from %s, setting state to CS_RESET and causing error %d",
                 addrfmt(addr, addrbuf), err);
#endif
        conn->state = CS_RESET;
      }
      break;
  }

  utp_call_on_error(conn->ctx, conn, err);
  return 1;
}

// Write bytes to the UTP socket.  Returns the number of bytes written.
// 0 indicates the socket is no longer writable, -1 indicates an error
ssize_t
utp_writev(utp_socket *conn, struct utp_iovec *iovec_input, size_t num_iovecs)
{
  static utp_iovec iovec[UTP_IOV_MAX];

  assert(conn);
  if(!conn)
    return -1;

  assert(iovec_input);
  if(!iovec_input)
    return -1;

  assert(num_iovecs);
  if(!num_iovecs)
    return -1;

  if(num_iovecs > UTP_IOV_MAX)
    num_iovecs = UTP_IOV_MAX;

  memcpy(iovec, iovec_input, sizeof(struct utp_iovec) * num_iovecs);

  size_t bytes = 0;
  size_t sent  = 0;
  for(size_t i = 0; i < num_iovecs; i++)
    bytes += iovec[i].iov_len;

#if UTP_DEBUG_LOGGING
  size_t param = bytes;
#endif

  if(conn->state != CS_CONNECTED)
  {
#if UTP_DEBUG_LOGGING
    conn->log(UTP_LOG_DEBUG, "UTP_Write %u bytes = false (not CS_CONNECTED)",
              (uint)bytes);
#endif
    return 0;
  }

  if(conn->fin_sent)
  {
#if UTP_DEBUG_LOGGING
    conn->log(UTP_LOG_DEBUG, "UTP_Write %u bytes = false (fin_sent already)",
              (uint)bytes);
#endif
    return 0;
  }

  conn->ctx->current_ms = utp_call_get_milliseconds(conn->ctx, conn);

  // don't send unless it will all fit in the window
  size_t packet_size = conn->get_packet_size();
  size_t num_to_send = min< size_t >(bytes, packet_size);
  while(!conn->is_full(num_to_send))
  {
    // Send an outgoing packet.
    // Also add it to the outgoing of packets that have been sent but not ACKed.

    bytes -= num_to_send;
    sent += num_to_send;

#if UTP_DEBUG_LOGGING
    conn->log(UTP_LOG_DEBUG,
              "Sending packet. seq_nr:%u ack_nr:%u wnd:%u/%u/%u rcv_win:%u "
              "size:%u cur_window_packets:%u",
              conn->seq_nr, conn->ack_nr,
              (uint)(conn->cur_window + num_to_send), (uint)conn->max_window,
              (uint)conn->max_window_user, (uint)conn->last_rcv_win,
              num_to_send, conn->cur_window_packets);
#endif
    conn->write_outgoing_packet(num_to_send, ST_DATA, iovec, num_iovecs);
    num_to_send = min< size_t >(bytes, packet_size);

    if(num_to_send == 0)
    {
#if UTP_DEBUG_LOGGING
      conn->log(UTP_LOG_DEBUG, "UTP_Write %u bytes = true", (uint)param);
#endif
      return sent;
    }
  }

  bool full = conn->is_full();
  if(full)
  {
    // mark the socket as not being writable.
    conn->state = CS_CONNECTED_FULL;
  }

#if UTP_DEBUG_LOGGING
  conn->log(UTP_LOG_DEBUG, "UTP_Write %u bytes = %s", (uint)bytes,
            full ? "false" : "true");
#endif

  // returns whether or not the socket is still writable
  // if the congestion window is not full, we can still write to it
  // return !full;
  return sent;
}

void
utp_read_drained(utp_socket *conn)
{
  assert(conn);
  if(!conn)
    return;

  assert(conn->state != CS_UNINITIALIZED);
  if(conn->state == CS_UNINITIALIZED)
    return;

  const size_t rcvwin = conn->get_rcv_window();

  if(rcvwin > conn->last_rcv_win)
  {
    // If last window was 0 send ACK immediately, otherwise should set timer
    if(conn->last_rcv_win == 0)
    {
      conn->send_ack();
    }
    else
    {
      conn->ctx->current_ms = utp_call_get_milliseconds(conn->ctx, conn);
      conn->schedule_ack();
    }
  }
}

// Should be called each time the UDP socket is drained
void
utp_issue_deferred_acks(utp_context *ctx)
{
  assert(ctx);
  if(!ctx)
    return;

  for(size_t i = 0; i < ctx->ack_sockets.GetCount(); i++)
  {
    UTPSocket *conn = ctx->ack_sockets[i];
    conn->send_ack();
    i--;
  }
}

// Should be called every 500ms
void
utp_check_timeouts(utp_context *ctx)
{
  assert(ctx);
  if(!ctx)
    return;

  ctx->current_ms = utp_call_get_milliseconds(ctx, NULL);

  if(ctx->current_ms - ctx->last_check < TIMEOUT_CHECK_INTERVAL)
    return;

  ctx->last_check = ctx->current_ms;

  for(size_t i = 0; i < ctx->rst_info.GetCount(); i++)
  {
    if((int)(ctx->current_ms - ctx->rst_info[i].timestamp) >= RST_INFO_TIMEOUT)
    {
      ctx->rst_info.MoveUpLast(i);
      i--;
    }
  }
  if(ctx->rst_info.GetCount() != ctx->rst_info.GetAlloc())
  {
    ctx->rst_info.Compact();
  }

  utp_hash_iterator_t it;
  UTPSocketKeyData *keyData;
  while((keyData = ctx->utp_sockets->Iterate(it)))
  {
    UTPSocket *conn = keyData->socket;
    conn->check_timeouts();

    // Check if the object was deleted
    if(conn->state == CS_DESTROY)
    {
#if UTP_DEBUG_LOGGING
      conn->log(UTP_LOG_DEBUG, "Destroying");
#endif
      delete conn;
    }
  }
}

int
utp_getpeername(utp_socket *conn, struct sockaddr *addr, socklen_t *addrlen)
{
  assert(addr);
  if(!addr)
    return -1;

  assert(addrlen);
  if(!addrlen)
    return -1;

  assert(conn);
  if(!conn)
    return -1;

  assert(conn->state != CS_UNINITIALIZED);
  if(conn->state == CS_UNINITIALIZED)
    return -1;

  socklen_t len;
  const SOCKADDR_STORAGE sa = conn->addr.get_sockaddr_storage(&len);
  *addrlen                  = min(len, *addrlen);
  memcpy(addr, &sa, *addrlen);
  return 0;
}

int
utp_get_delays(UTPSocket *conn, uint32 *ours, uint32 *theirs, uint32 *age)
{
  assert(conn);
  if(!conn)
    return -1;

  assert(conn->state != CS_UNINITIALIZED);
  if(conn->state == CS_UNINITIALIZED)
  {
    if(ours)
      *ours = 0;
    if(theirs)
      *theirs = 0;
    if(age)
      *age = 0;
    return -1;
  }

  if(ours)
    *ours = conn->our_hist.get_value();
  if(theirs)
    *theirs = conn->their_hist.get_value();
  if(age)
    *age = (uint32)(conn->ctx->current_ms - conn->last_measured_delay);
  return 0;
}

// Close the UTP socket.
// It is not valid for the upper layer to refer to socket after it is closed.
// Data will keep to try being delivered after the close.
void
utp_close(UTPSocket *conn)
{
  assert(conn);
  if(!conn)
    return;

  assert(conn->state != CS_UNINITIALIZED && conn->state != CS_DESTROY);

#if UTP_DEBUG_LOGGING
  conn->log(UTP_LOG_DEBUG, "UTP_Close in state:%s", statenames[conn->state]);
#endif

  switch(conn->state)
  {
    case CS_CONNECTED:
    case CS_CONNECTED_FULL:
      conn->read_shutdown   = true;
      conn->close_requested = true;
      if(!conn->fin_sent)
      {
        conn->fin_sent = true;
        conn->write_outgoing_packet(0, ST_FIN, NULL, 0);
      }
      else if(conn->fin_sent_acked)
      {
        conn->state = CS_DESTROY;
      }
      break;

    case CS_SYN_SENT:
      conn->rto_timeout = utp_call_get_milliseconds(conn->ctx, conn)
          + min< uint >(conn->rto * 2, 60);
      // fall through
    case CS_SYN_RECV:
      // fall through
    default:
      conn->state = CS_DESTROY;
      break;
  }

#if UTP_DEBUG_LOGGING
  conn->log(UTP_LOG_DEBUG, "UTP_Close end in state:%s",
            statenames[conn->state]);
#endif
}

void
utp_shutdown(UTPSocket *conn, int how)
{
  assert(conn);
  if(!conn)
    return;

  assert(conn->state != CS_UNINITIALIZED && conn->state != CS_DESTROY);

#if UTP_DEBUG_LOGGING
  conn->log(UTP_LOG_DEBUG, "UTP_shutdown(%d) in state:%s", how,
            statenames[conn->state]);
#endif

  if(how != SHUT_WR)
  {
    conn->read_shutdown = true;
  }
  if(how != SHUT_RD)
  {
    switch(conn->state)
    {
      case CS_CONNECTED:
      case CS_CONNECTED_FULL:
        if(!conn->fin_sent)
        {
          conn->fin_sent = true;
          conn->write_outgoing_packet(0, ST_FIN, NULL, 0);
        }
        break;
      case CS_SYN_SENT:
        conn->rto_timeout = utp_call_get_milliseconds(conn->ctx, conn)
            + min< uint >(conn->rto * 2, 60);
      default:
        break;
    }
  }
}

utp_context *
utp_get_context(utp_socket *socket)
{
  assert(socket);
  return socket ? socket->ctx : NULL;
}

void *
utp_set_userdata(utp_socket *socket, void *userdata)
{
  assert(socket);
  if(socket)
    socket->userdata = userdata;
  return socket ? socket->userdata : NULL;
}

void *
utp_get_userdata(utp_socket *socket)
{
  assert(socket);
  return socket ? socket->userdata : NULL;
}

void
struct_utp_context::log(int level, utp_socket *socket, char const *fmt, ...)
{
  if(!would_log(level))
  {
    return;
  }

  va_list va;
  va_start(va, fmt);
  log_unchecked(socket, fmt, va);
  va_end(va);
}

void
struct_utp_context::log_unchecked(utp_socket *socket, char const *fmt, ...)
{
  va_list va;
  char buf[4096];

  va_start(va, fmt);
  vsnprintf(buf, 4096, fmt, va);
  buf[4095] = '\0';
  va_end(va);

  utp_call_log(this, socket, (const byte *)buf);
}

inline bool
struct_utp_context::would_log(int level)
{
  if(level == UTP_LOG_NORMAL)
    return log_normal;
  if(level == UTP_LOG_MTU)
    return log_mtu;
  if(level == UTP_LOG_DEBUG)
    return log_debug;
  return true;
}

utp_socket_stats *
utp_get_stats(utp_socket *socket)
{
#ifdef _DEBUG
  assert(socket);
  if(!socket)
    return NULL;
  socket->_stats.mtu_guess =
      socket->mtu_last ? socket->mtu_last : socket->mtu_ceiling;
  return &socket->_stats;
#else
  (void)socket;
  return NULL;
#endif
}